crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/med_mcqa_scenario.py
@@ -0,0 +1,102 @@
+import json
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, Input, Output
+
+
+class MedMCQAScenario(Scenario):
+    """
+    From "MedMCQA: A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering"
+    (Pal et al.), MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to address
+    real-world medical entrance exam questions." The dataset "...has more than 194k high-quality AIIMS & NEET PG
+    entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average
+    token length of 12.77 and high topical diversity."
+
+    The following is an example from the dataset:
+
+    Question: In a patient of heart disease antibiotic prophylaxis for dental extraction is:
+    A. Amoxicillin.
+    B. Imipenem.
+    C. Gentamicin.
+    D. Erythromycin.
+    Answer: A
+
+    Paper: https://arxiv.org/abs/2203.14371
+    Code: https://github.com/MedMCQA/MedMCQA
+
+    @InProceedings{pmlr-v174-pal22a,
+        title = {MedMCQA: A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering},
+        author = {Pal, Ankit and Umapathi, Logesh Kumar and Sankarasubbu, Malaikannan},
+        booktitle = {Proceedings of the Conference on Health, Inference, and Learning},
+        pages = {248--260},
+        year = {2022},
+        editor = {Flores, Gerardo and Chen, George H and Pollard, Tom and Ho, Joyce C and Naumann, Tristan},
+        volume = {174},
+        series = {Proceedings of Machine Learning Research},
+        month = {07--08 Apr},
+        publisher = {PMLR},
+        pdf = {https://proceedings.mlr.press/v174/pal22a/pal22a.pdf},
+        url = {https://proceedings.mlr.press/v174/pal22a.html},
+        abstract = {This paper introduces MedMCQA, a new large-scale, Multiple-Choice Question Answering (MCQA) dataset
+        designed to address real-world medical entrance exam questions. More than 194k high-quality AIIMS & NEET PG
+        entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average token
+        length of 12.77 and high topical diversity. Each sample contains a question, correct answer(s), and other
+        options which requires a deeper language understanding as it tests the 10+ reasoning abilities of a model across
+        a wide range of medical subjects & topics. A detailed explanation of the solution, along with the above
+        information, is provided in this study.}
+    }
+    """
+
+    # From https://github.com/MedMCQA/MedMCQA#data-fields, there are four possible answer choices
+    # where "cop" corresponds to the index of the correct option.
+    ANSWER_OPTION_TO_INDEX: Dict[str, int] = {"opa": 1, "opb": 2, "opc": 3, "opd": 4}
+    DATASET_DOWNLOAD_URL: str = (
+        "https://drive.google.com/uc?export=download&id=15VkJdq5eyWIkfb_aoD3oS8i4tScbHYky&confirm=t"
+    )
+
+    name = "med_mcqa"
+    description = (
+        "MedMCQA is a multiple-choice question answering (MCQA) dataset designed to address "
+        "real-world medical entrance exam questions."
+    )
+    tags = ["question_answering", "biomedical"]
+
+    def get_instances(self) -> List[Instance]:
+        data_path: str = os.path.join(self.output_path, "data")
+        ensure_file_downloaded(
+            source_url=self.DATASET_DOWNLOAD_URL,
+            target_path=data_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances: List[Instance] = []
+
+        # From https://github.com/MedMCQA/MedMCQA#model-submission-and-test-set-evaluation,
+        # "to preserve the integrity of test results, we do not release the test set's ground-truth to the public".
+        for split in [TRAIN_SPLIT, VALID_SPLIT]:
+            # Although the files end with ".json", they are actually JSONL files
+            split_file_name: str = f"{'dev' if split == VALID_SPLIT else split}.json"
+            split_path: str = os.path.join(data_path, split_file_name)
+
+            with open(split_path, "r") as f:
+                for line in f:
+                    # The data fields and their explanations can be found here:
+                    # https://github.com/MedMCQA/MedMCQA#data-fields
+                    example: Dict[str, str] = json.loads(line.rstrip())
+
+                    references: List[Reference] = [
+                        # Value of "cop" corresponds to the index of the correct option
+                        Reference(Output(text=example[option]), tags=[CORRECT_TAG] if index == example["cop"] else [])
+                        for option, index in MedMCQAScenario.ANSWER_OPTION_TO_INDEX.items()
+                    ]
+                    instance: Instance = Instance(
+                        input=Input(text=example["question"]),
+                        references=references,
+                        split=split,
+                    )
+                    instances.append(instance)
+
+        return instances
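For reference, a minimal sketch of the option-to-reference mapping used in the hunk above, run on a hand-written MedMCQA-style record (the record itself is illustrative, not taken from the dataset); it assumes the Reference/Output/CORRECT_TAG definitions from helm.benchmark.scenarios.scenario in this release.

from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

# Hand-written record in the MedMCQA JSONL shape; "cop" is the 1-indexed correct option.
example = {
    "question": "In a patient of heart disease antibiotic prophylaxis for dental extraction is:",
    "opa": "Amoxicillin.",
    "opb": "Imipenem.",
    "opc": "Gentamicin.",
    "opd": "Erythromycin.",
    "cop": 1,
}
ANSWER_OPTION_TO_INDEX = {"opa": 1, "opb": 2, "opc": 3, "opd": 4}

# Same comprehension as in get_instances(): every option becomes a Reference,
# and only the option whose index equals "cop" is tagged as correct.
references = [
    Reference(Output(text=example[option]), tags=[CORRECT_TAG] if index == example["cop"] else [])
    for option, index in ANSWER_OPTION_TO_INDEX.items()
]
assert sum(reference.is_correct for reference in references) == 1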
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py
@@ -0,0 +1,119 @@
+import os
+from typing import List
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
+
+
+class MedParagraphSimplificationScenario(Scenario):
+    """
+    "Paragraph-level Simplification of Medical Texts" (Devaraj et al.) studies the problem of learning to simplify
+    medical texts. One of their contributions is a new corpus that is composed of technical abstracts and their
+    lay summaries on various clinical topics.
+
+    The author generated train/val/test splits, which are available in the GitHub repository linked in the paper.
+
+    The following is an example from the dataset:
+
+    {
+        "doi": "10.1002/14651858.CD011112.pub2",
+        "abstract": "We included six studies (reported as seven papers) involving 326 participants whose ages ranged
+        from 39 to 83 years, with a gender bias towards men (73% to 95% across studies), reflecting the characteristics
+        of patients with HNC. The risk of bias in the studies was generally high. We did not pool data from studies
+        because of significant differences in the interventions and outcomes evaluated. We found a lack of
+        standardisation and consistency in the outcomes measured and the endpoints at which they were evaluated.
+        We found no evidence that therapeutic exercises were better than TAU, or any other treatment, in improving the
+        safety and efficiency of oral swallowing (our primary outcome) or in improving any of the secondary outcomes.
+        Using the GRADE system, we classified the overall quality of the evidence for each outcome as very low, due to
+        the limited number of trials and their low quality. There were no adverse events reported that were directly
+        attributable to the intervention (swallowing exercises). We found no evidence that undertaking therapeutic
+        exercises before, during and/or immediately after HNC treatment leads to improvement in oral swallowing. This
+        absence of evidence may be due to the small participant numbers in trials, resulting in insufficient power to
+        detect any difference. Data from the identified trials could not be combined due to differences in the choice
+        of primary outcomes and in the measurement tools used to assess them, and the differing baseline and endpoints
+        across studies. Designing and implementing studies with stronger methodological rigour is essential. There needs
+        to be agreement about the key primary outcomes, the choice of validated assessment tools to measure them and the
+        time points at which those measurements are made.",
+        "pls": "We included six studies with 326 participants who undertook therapeutic exercises before, during and/or
+        after HNC treatment. We could not combine the results of the studies because of the variation in participants'
+        cancers, their treatments, the outcomes measured and the tools used to assess them, as well as the differing
+        time points for testing. Researchers have compared: (i) therapeutic exercises versus treatment as usual (TAU);
+        (ii) therapeutic exercises versus sham therapy; (iii) therapeutic exercises plus TAU versus TAU. The therapeutic
+        exercises varied in their design, timing and intensity. TAU involved managing patients' dysphagia when it
+        occurred, including inserting a tube for non-oral feeding. The evidence is up to date to 1 July 2016. We found
+        no evidence that therapeutic exercises were better than TAU, or any other treatment, in improving the safety and
+        efficiency of oral swallowing (our primary outcome) or in improving any of the secondary outcomes. However,
+        there is insufficient evidence to draw any clear conclusion about the effects of undertaking therapeutic
+        exercises before during and/or immediately after HNC treatment on preventing or reducing dysphagia. Studies had
+        small participant numbers, used complex interventions and varied in the choice of outcomes measured, making it
+        difficult to draw reliable conclusions. There were no reported adverse events directly attributable to the
+        intervention (swallowing exercises). The current quality of the evidence to support the use of therapeutic
+        exercises before, during and/or immediately after HNC treatment to prevent/reduce dysphagia is very low. We need
+        better designed, rigorous studies with larger participant numbers and agreed endpoints and outcome measurements
+        in order to draw clear(er) conclusions."
+    },
+
+    where "pls" stands for "plain-language summary".
+
+    Paper: http://arxiv.org/abs/2104.05767
+    Code: https://github.com/AshOlogn/Paragraph-level-Simplification-of-Medical-Texts
+
+    @inproceedings{devaraj-etal-2021-paragraph,
+        title = "Paragraph-level Simplification of Medical Texts",
+        author = "Devaraj, Ashwin and Marshall, Iain and Wallace, Byron and Li, Junyi Jessy",
+        booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for
+        Computational Linguistics",
+        month = jun,
+        year = "2021",
+        publisher = "Association for Computational Linguistics",
+        url = "https://www.aclweb.org/anthology/2021.naacl-main.395",
+        pages = "4972--4984",
+    }
+    """
+
+    DOWNLOAD_URL_TEMPLATE: str = (
+        "https://raw.githubusercontent.com/AshOlogn/Paragraph-level-Simplification-of-Medical-Texts/"
+        "main/data/data-1024/{file_name}"
+    )
+
+    name = "med_paragraph_simplification"
+    description = "Corpus with technical abstracts and their lay summaries on various clinical topics"
+    tags = ["summarization", "biomedical"]
+
+    def get_instances(self) -> List[Instance]:
+        data_path: str = os.path.join(self.output_path, "data")
+        ensure_directory_exists(data_path)
+
+        instances: List[Instance] = []
+        for split in ALL_SPLITS:
+            # Original abstracts
+            abstract_file_name: str = f"{'val' if split == VALID_SPLIT else split}.source"
+            abstract_path: str = os.path.join(data_path, abstract_file_name)
+            ensure_file_downloaded(
+                source_url=MedParagraphSimplificationScenario.DOWNLOAD_URL_TEMPLATE.format(
+                    file_name=abstract_file_name
+                ),
+                target_path=abstract_path,
+            )
+
+            # Plain-language summaries of the abstracts
+            pls_file_name: str = f"{'val' if split == VALID_SPLIT else split}.target"
+            pls_path: str = os.path.join(data_path, pls_file_name)
+            ensure_file_downloaded(
+                source_url=MedParagraphSimplificationScenario.DOWNLOAD_URL_TEMPLATE.format(file_name=pls_file_name),
+                target_path=pls_path,
+            )
+
+            with open(abstract_path, "r") as abstract_file:
+                with open(pls_path, "r") as pls_file:
+                    for abstract_line, summary_line in zip(abstract_file, pls_file):
+                        abstract: str = abstract_line.rstrip()
+                        summary: str = summary_line.rstrip()
+                        instance: Instance = Instance(
+                            input=Input(text=abstract),
+                            references=[Reference(Output(text=summary), tags=[CORRECT_TAG])],
+                            split=split,
+                        )
+                        instances.append(instance)
+
+        return instances
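A small sketch of how the split file names and download URLs above expand; it assumes HELM's split constants are the plain strings "train"/"valid"/"test", with the validation split renamed to "val" to match the upstream repository's file names.

# Expand the raw.githubusercontent.com template for each split; the repository stores
# the validation files under the name "val", so the "valid" split is remapped.
DOWNLOAD_URL_TEMPLATE = (
    "https://raw.githubusercontent.com/AshOlogn/Paragraph-level-Simplification-of-Medical-Texts/"
    "main/data/data-1024/{file_name}"
)
VALID_SPLIT = "valid"  # assumed value of HELM's VALID_SPLIT constant

for split in ["train", VALID_SPLIT, "test"]:
    file_prefix = "val" if split == VALID_SPLIT else split
    abstract_url = DOWNLOAD_URL_TEMPLATE.format(file_name=f"{file_prefix}.source")
    summary_url = DOWNLOAD_URL_TEMPLATE.format(file_name=f"{file_prefix}.target")
    print(split, abstract_url, summary_url)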
helm/benchmark/scenarios/med_qa_scenario.py
@@ -0,0 +1,96 @@
+import json
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
+
+
+class MedQAScenario(Scenario):
+    """
+    From "What Disease Does This Patient Have? A Large-Scale Open Domain Question Answering Dataset from Medical Exams"
+    (Jin et al.), MedQA is an open domain question answering dataset composed of questions from professional medical
+    board exams.
+
+    From Jin et al., "to comply with fair use of law, we shuffle the order of answer options and randomly delete
+    one of the wrong options for each question for USMLE and MCMLE datasets, which results in four options with one
+    right option and three wrong options".
+    We use the 4-options, English subset ("US") of the dataset, which contains 12,723 questions.
+
+    The following is an example from the dataset:
+
+    {
+        "question": "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states
+        it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She
+        otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C),
+        blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air.
+        Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the
+        following is the best treatment for this patient?",
+        "answer": "Nitrofurantoin",
+        "options": {
+            "A": "Ampicillin",
+            "B": "Ceftriaxone",
+            "C": "Ciprofloxacin",
+            "D": "Doxycycline",
+            "E": "Nitrofurantoin"
+        },
+        "meta_info": "step2&3",
+        "answer_idx": "E"
+    }
+
+    Paper: https://arxiv.org/abs/2009.13081
+    Code: https://github.com/jind11/MedQA
+
+    @article{jin2020disease,
+        title={What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset
+        from Medical Exams},
+        author={Jin, Di and Pan, Eileen and Oufattole, Nassim and Weng, Wei-Hung and Fang, Hanyi and Szolovits, Peter},
+        journal={arXiv preprint arXiv:2009.13081},
+        year={2020}
+    }
+    """
+
+    DATASET_DOWNLOAD_URL: str = (
+        "https://drive.google.com/uc?export=download&id=1ImYUSLk9JbgHXOemfvyiDiirluZHPeQw&confirm=t"
+    )
+
+    name = "med_qa"
+    description = "An open domain question answering (QA) dataset collected from professional medical board exams."
+    tags = ["question_answering", "biomedical"]
+
+    def get_instances(self) -> List[Instance]:
+        data_path: str = os.path.join(self.output_path, "data")
+        ensure_file_downloaded(
+            source_url=self.DATASET_DOWNLOAD_URL,
+            target_path=data_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances: List[Instance] = []
+        for split in ALL_SPLITS:
+            split_file_name: str = f"phrases_no_exclude_{'dev' if split == VALID_SPLIT else split}.jsonl"
+            split_path: str = os.path.join(data_path, "data_clean", "questions", "US", "4_options", split_file_name)
+
+            with open(split_path, "r") as f:
+                for line in f:
+                    example: Dict = json.loads(line.rstrip())
+                    assert len(example["options"]) == 4, f"Expected 4 possible answer choices: {example['options']}"
+                    references: List[Reference] = [
+                        # Some answers have extraneous characters e.g., 'Pulmonary embolism\n"'.
+                        # Filed an issue: https://github.com/jind11/MedQA/issues/5
+                        # Value of "answer_idx" corresponds to the letter of the correct answer.
+                        Reference(
+                            Output(text=answer.rstrip('"').rstrip()),
+                            tags=[CORRECT_TAG] if index == example["answer_idx"] else [],
+                        )
+                        for index, answer in example["options"].items()
+                    ]
+                    instance: Instance = Instance(
+                        input=Input(text=example["question"]),
+                        references=references,
+                        split=split,
+                    )
+                    instances.append(instance)
+
+        return instances
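A toy illustration of the answer clean-up and answer_idx matching above; the trailing characters on option "E" are invented to mimic the issue flagged in the code comment.

# MedQA-style options dict; option "E" carries hypothetical trailing characters
# like the ones reported in https://github.com/jind11/MedQA/issues/5.
options = {
    "A": "Ampicillin",
    "B": "Ceftriaxone",
    "C": "Ciprofloxacin",
    "D": "Doxycycline",
    "E": 'Nitrofurantoin\n"',
}
answer_idx = "E"

cleaned = {index: answer.rstrip('"').rstrip() for index, answer in options.items()}
correct = [text for index, text in cleaned.items() if index == answer_idx]
assert correct == ["Nitrofurantoin"]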
helm/benchmark/scenarios/opinions_qa_scenario.py
@@ -0,0 +1,194 @@
+import os
+import pandas as pd
+from typing import List, Dict
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Input,
+    Output,
+)
+
+
+class OpinionsQAScenario(Scenario):
+    """
+    The OpinionsQAScenario dataset is from the paper "Whose Opinions Do Language Models Reflect?"
+    [Santurkar et al., 2023].
+
+    OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently
+    subjective, there isn't a single ground truth response. Instead, the object of interest is how
+    the distribution of model responses compares to those obtained from human survey participants.
+
+    As discussed in Santurkar et al., we consider prompting an LM:
+    1. Without any context (zero-shot) to evaluate the "default" opinions reflected
+    by it.
+    2. With context containing information pertaining to the group (say Democrats) we want to steer
+    the model towards using one of three modes: "steer-qa", "steer-bio" or "steer-portray". See the
+    example below for an illustration of the difference between the three steering modes.
+
+
+    We prompt models using the following format
+
+    <optional context>
+
+    Question: <question>
+    A. <reference>
+    B. <reference>
+    C. <reference>
+    D. <reference>
+    Answer: <A/B/C/D>
+
+    For instance:
+
+    <optional context>
+
+    Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes
+    to gun violence in the country today?
+    A. A great deal
+    B. A fair amount
+    C. Not too much
+    D. Not at all
+    E. Refused
+    Answer:
+
+
+    In the example above, the <optional context> varies based on the choice of the "context" variable as follows:
+
+    1. If context="default", <optional context> is not used and the model is prompted with the question directly.
+
+    2. If context="steer-qa", the group information (to steer the model towards) is passed in a QA format, e.g.:
+
+    Question: In politics today, do you consider yourself a
+    A. Republican
+    B. Democrat
+    C. Independent
+    D. Something else
+    E. Refused
+    Answer: B
+
+    3. If context="steer-bio", the group information is passed to the model as a free-text response
+    to a prompt asking for demographic information, e.g.:
+
+    Below you will be asked to provide a short description of your political affiliation and
+    then answer some questions.
+
+    Description: In politics today, I consider myself a Democrat.
+
+    4. If context="steer-portray", the model is explicitly asked to behave as if it were a member of
+    said group, e.g.:
+
+    Answer the following question as if in politics today, you considered yourself a Democrat.
+
+    """
+
+    name = "opinions_qa"
+    description = "Subjective questions dataset based on Pew American Trends Panel opinion polls."
+    tags = ["multiple_choice"]
+
+    """ Information needed to download the dataset """
+    CODALAB_URI_TEMPLATE: str = (
+        "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}"
+    )
+    CODALAB_BUNDLE: str = "0xa6f81cc62d7d4ccb93031a72d2043669"
+    FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv"
+    PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92]
+
+    def __init__(self, survey_type: str, context: str):
+        super().__init__()
+        assert context in ["default", "steer-qa", "steer-bio", "steer-portray"]
+
+        self.survey_type: str = survey_type
+        self.context: str = context
+
+    def download_data(self):
+
+        self.output_path: str = os.path.join(self.output_path, "data")
+        if not os.path.exists(self.output_path):
+            os.makedirs(self.output_path)
+
+        DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
+        DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
+        DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
+
+        for filename in DOWNLOAD_FILENAMES:
+            data_path: str = os.path.join(self.output_path, filename)
+
+            source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
+            ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
+
+    def read_survey_questions(self, csv_path):
+        df = pd.read_csv(csv_path, sep="\t")
+        df["options"] = df.apply(lambda x: eval(x["options"]), axis=1)
+        return df
+
+    def get_instances(self) -> List[Instance]:
+        self.download_data()
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"]
+        csv_dict = {
+            "dev": os.path.join(self.output_path, f"{self.context}.csv"),
+            "test": os.path.join(self.output_path, f"{self.survey_type}.csv"),
+        }
+
+        bios_df = None
+        if self.context in ["steer-bio", "steer-portray"]:
+            bios_path = os.path.join(self.output_path, f"{self.context}.csv")
+            bios_df = pd.read_csv(bios_path, sep="\t")
+
+        for split in all_splits:
+
+            csv_path: str = csv_dict[split]
+            assert os.path.exists(csv_path)
+
+            question_df = self.read_survey_questions(csv_path)
+
+            for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
+
+                # Opinions QA test questions have no correct answer and thus we set it to be None by default
+                # for all test instances.
+                # In the case where context = steer-qa, we add demographic information in the form of an
+                # in-context question answer pair as shown in the example above.
+
+                correct_answer = None if split == "test" else question_df["correct"][qidx]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(
+                        Output(text=answer),
+                        tags=[CORRECT_TAG] if (answer == correct_answer and split != "test") else [],
+                    )
+
+                if bios_df is None:
+                    # context = "default" or "steer-qa"
+                    instance = Instance(
+                        Input(text=question),
+                        references=list(map(answer_to_reference, answers)),
+                        split=splits[split],
+                    )
+                    instances.append(instance)
+                else:
+                    # context = "steer-bio" or "steer-portray"
+                    for bio in bios_df["question"].values:
+
+                        context = PassageQuestionInput(passage=bio, question=question + "\n")
+                        instance = Instance(
+                            context,
+                            references=list(map(answer_to_reference, answers)),
+                            split=splits[split],
+                        )
+                        instances.append(instance)
+
+        return instances
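A rough sketch of how instance counts behave under the contexts above: "default" and "steer-qa" produce one instance per survey question, while "steer-bio" and "steer-portray" pair every question with every group description read from the steering CSV. The question and bios below are placeholders, not rows from the actual files.

# Placeholder survey question and group descriptions (the real ones come from the
# Pew_American_Trends_Panel_*.csv and steer-*.csv files downloaded above).
questions = ["How much, if at all, does the ease of legally obtaining guns contribute to gun violence?"]
bios = [
    "In politics today, I consider myself a Democrat.",
    "In politics today, I consider myself a Republican.",
]

# context = "default" / "steer-qa": one instance per question.
default_instances = [(None, question) for question in questions]
# context = "steer-bio" / "steer-portray": one instance per (bio, question) pair.
steered_instances = [(bio, question) for question in questions for bio in bios]

assert len(default_instances) == len(questions)
assert len(steered_instances) == len(questions) * len(bios)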
helm/benchmark/scenarios/scenario.py
@@ -147,6 +147,11 @@ class Instance:
                 return reference
         return None
 
+    @property
+    def all_correct_references(self) -> List[Reference]:
+        """Return all correct references."""
+        return [reference for reference in self.references if reference.is_correct]
+
     def render_lines(self) -> List[str]:
         info = [f"input: {format_text(self.input.text)}"]
         if self.sub_split:
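A minimal usage sketch of the new all_correct_references property, assuming the Instance/Reference constructors used elsewhere in this diff (helm.benchmark.scenarios.scenario).

from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference

instance = Instance(
    input=Input(text="2 + 2 ="),
    references=[
        Reference(Output(text="4"), tags=[CORRECT_TAG]),  # correct
        Reference(Output(text="5"), tags=[]),  # incorrect
    ],
    split=TEST_SPLIT,
)
# The property filters on Reference.is_correct, so only the tagged reference comes back.
assert len(instance.all_correct_references) == 1
assert instance.all_correct_references[0].is_correct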
helm/benchmark/scenarios/the_pile_scenario.py
@@ -78,7 +78,7 @@ class ThePileScenario(Scenario):
         # Download the raw data
         data_path = os.path.join(self.output_path, "data")
         ensure_file_downloaded(
-            source_url="https://
+            source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
             target_path=data_path,
             unpack=True,
         )
helm/benchmark/scenarios/wmt_14_scenario.py
@@ -0,0 +1,96 @@
+from typing import List, Any
+from datasets import load_dataset
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+MAX_TRAIN_INSTANCES = 20_000  # This is arbitrary, but 20,000 training examples should be enough.
+
+
+class WMT14Scenario(Scenario):
+    """
+    The 2014 Workshop on Statistical Machine Translation:
+    https://aclanthology.org/W14-3302.pdf
+
+    The scenario consists of 5 subsets, each of which is a parallel corpus between English and another language. The
+    non-English languages include Czech, German, French, Hindi, and Russian.
+
+    For each language pair, the validation and test set each includes around 3,000 examples, while the training set is
+    usually much larger. We therefore randomly downsample the training set to speed up data processing.
+
+    Task prompt structure:
+
+    Translate {source_language} to {target_language}:
+    {Hypothesis} = {Reference}
+
+    Example from WMT14 Fr-En:
+
+    Hypothesis: Assemblée générale
+    Reference: General Assembly
+    """
+
+    name = "WMT_14"
+    description = "Scenario for the 2014 Workshop on Statistical Machine Translation"
+    tags = ["machine_translation"]
+
+    def __init__(self, source_language, target_language):
+        super().__init__()
+        valid_languages = set(["cs", "de", "fr", "hi", "ru", "en"])
+        self.source_language = source_language
+        self.target_language = target_language
+        if self.source_language not in valid_languages or self.target_language not in valid_languages:
+            raise ValueError("WMT14 only includes the following languages: cs, de, fr, hi, ru, en.")
+        if self.source_language == self.target_language:
+            raise ValueError("The source language and the target language should be different.")
+        if self.source_language != "en" and self.target_language != "en":
+            raise ValueError("One of the languages should be English.")
+
+    def _deduplicate(self, dataset: List):
+        """
+        Remove instances in the dataset with the same label.
+        """
+
+        deduplicated_dataset = []
+        seen_labels = set()
+        for example in dataset:
+            if example[self.target_language] not in seen_labels:
+                seen_labels.add(example[self.target_language])
+                deduplicated_dataset.append(example)
+        return deduplicated_dataset
+
+    def get_instances(self) -> List[Instance]:
+        hlog("Loading the HuggingFace dataset. The first time could take several minutes.")
+        subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
+        hf_dataset: Any = load_dataset("wmt14", subset_name)
+        splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+
+        instances: List[Instance] = []
+        hlog("Generating instances")
+        # Some training sets are too large, so we will only take a random subset of it.
+        hf_dataset["train"] = hf_dataset["train"].shuffle(seed=42)[:MAX_TRAIN_INSTANCES]
+        hf_dataset["train"]["translation"] = self._deduplicate(hf_dataset["train"]["translation"])
+        for example in hf_dataset["train"]["translation"]:
+            source_sentence: str = example[self.source_language]
+            target_sentence: str = example[self.target_language]
+            instances.append(
+                Instance(
+                    input=Input(text=source_sentence),
+                    references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                    split="train",
+                )
+            )
+
+        # No special handling needed for validation or test.
+        for split_name in ["validation", "test"]:
+            split = splits[split_name]
+            for example in hf_dataset[split_name]:
+                source_sentence = example["translation"][self.source_language]
+                target_sentence = example["translation"][self.target_language]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+        return instances
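A toy run of the deduplication step above on a hand-made fr-en fragment: only the first example for each distinct target-language sentence is kept.

# Hand-made parallel examples; the second one repeats the English target and is dropped.
target_language = "en"
dataset = [
    {"fr": "Assemblée générale", "en": "General Assembly"},
    {"fr": "L'Assemblée générale", "en": "General Assembly"},
    {"fr": "Conseil de sécurité", "en": "Security Council"},
]

deduplicated_dataset, seen_labels = [], set()
for example in dataset:
    if example[target_language] not in seen_labels:
        seen_labels.add(example[target_language])
        deduplicated_dataset.append(example)

assert [example["fr"] for example in deduplicated_dataset] == ["Assemblée générale", "Conseil de sécurité"]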
helm/benchmark/static/benchmarking.css
@@ -136,3 +136,17 @@ tbody .table-sort-column {
   background-color: #f5f5f5;
   white-space: pre-wrap;
 }
+
+.plot {
+  margin: 15px;
+}
+
+.plot img {
+  margin: 10px;
+}
+
+.plot-caption {
+  color: #555;
+  font-style: italic;
+  margin: 5px;
+}