crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,458 @@
1
+ import ast
2
+ import random
3
+ from pathlib import Path
4
+ from typing import List, Any
5
+
6
+ import datasets
7
+ from datasets import load_dataset
8
+
9
+ from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Output, Input
10
+
11
+
12
+ class TaskType:
13
+ SLTC = "SingleLabelTextClassification"
14
+ MLTC = "MultiLabelTextClassification"
15
+ NER = "NamedEntityRecognition"
16
+ QA = "QuestionAnswering"
17
+
18
+
19
+ BRAZILIAN_COURT_DECISIONS_JUDGMENT = "brazilian_court_decisions_judgment"
20
+ BRAZILIAN_COURT_DECISIONS_UNANIMITY = "brazilian_court_decisions_unanimity"
21
+ GERMAN_ARGUMENT_MINING = "german_argument_mining"
22
+ GREEK_LEGAL_CODE_CHAPTER = "greek_legal_code_chapter"
23
+ GREEK_LEGAL_CODE_SUBJECT = "greek_legal_code_subject"
24
+ GREEK_LEGAL_CODE_VOLUME = "greek_legal_code_volume"
25
+ SWISS_JUDGMENT_PREDICTION = "swiss_judgment_prediction"
26
+ ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS = "online_terms_of_service_unfairness_levels"
27
+ ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS = "online_terms_of_service_clause_topics"
28
+ COVID19_EMERGENCY_EVENT = "covid19_emergency_event"
29
+ MULTI_EURLEX_LEVEL_1 = "multi_eurlex_level_1"
30
+ MULTI_EURLEX_LEVEL_2 = "multi_eurlex_level_2"
31
+ MULTI_EURLEX_LEVEL_3 = "multi_eurlex_level_3"
32
+ GREEK_LEGAL_NER = "greek_legal_ner"
33
+ LEGALNERO = "legalnero"
34
+ LENER_BR = "lener_br"
35
+ MAPA_COARSE = "mapa_coarse"
36
+ MAPA_FINE = "mapa_fine"
37
# Maps each subset name to its TaskType; drives the per-task branching in
# LEXTREMEScenario.get_instances_for_subset.
TASK_CODE_MAPPING = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: TaskType.SLTC,
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: TaskType.SLTC,
    GERMAN_ARGUMENT_MINING: TaskType.SLTC,
    GREEK_LEGAL_CODE_CHAPTER: TaskType.SLTC,
    GREEK_LEGAL_CODE_SUBJECT: TaskType.SLTC,
    GREEK_LEGAL_CODE_VOLUME: TaskType.SLTC,
    SWISS_JUDGMENT_PREDICTION: TaskType.SLTC,
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: TaskType.SLTC,
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: TaskType.MLTC,
    COVID19_EMERGENCY_EVENT: TaskType.MLTC,
    MULTI_EURLEX_LEVEL_1: TaskType.MLTC,
    MULTI_EURLEX_LEVEL_2: TaskType.MLTC,
    MULTI_EURLEX_LEVEL_3: TaskType.MLTC,
    GREEK_LEGAL_NER: TaskType.NER,
    LEGALNERO: TaskType.NER,
    LENER_BR: TaskType.NER,
    MAPA_COARSE: TaskType.NER,
    MAPA_FINE: TaskType.NER,
}
57
+
58
+
59
def get_lextreme_task_type(subset):
    """Return the TaskType string registered for ``subset`` in TASK_CODE_MAPPING.

    Raises KeyError for an unknown subset name.
    """
    task_type = TASK_CODE_MAPPING[subset]
    return task_type
61
+
62
+
63
# Per-subset cap on in-context training examples; the comments record the
# approximate per-example token budget that motivated each cap.
TASK_MAX_TRAIN_INSTANCES_MAPPING = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: 4,  # ~ max 1024 tokens
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: 4,  # ~ max 1024 tokens
    GERMAN_ARGUMENT_MINING: 5,  # ~ max 256 tokens
    GREEK_LEGAL_CODE_CHAPTER: 1,  # ~ max 4096 tokens
    GREEK_LEGAL_CODE_SUBJECT: 1,  # ~ max 4096 tokens
    GREEK_LEGAL_CODE_VOLUME: 1,  # ~ max 4096 tokens
    SWISS_JUDGMENT_PREDICTION: 2,  # ~ max 2048 tokens
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: 5,  # ~ max 256 tokens
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: 5,  # ~ max 256 tokens
    COVID19_EMERGENCY_EVENT: 5,  # ~ max 256 tokens
    MULTI_EURLEX_LEVEL_1: 1,  # ~ max 4096 tokens
    MULTI_EURLEX_LEVEL_2: 1,  # ~ max 4096 tokens
    MULTI_EURLEX_LEVEL_3: 1,  # ~ max 4096 tokens
    GREEK_LEGAL_NER: 5,  # ~ max 512 tokens
    LEGALNERO: 5,  # ~ max 512 tokens
    LENER_BR: 5,  # ~ max 512 tokens
    MAPA_COARSE: 5,  # ~ max 512 tokens
    MAPA_FINE: 5,  # ~ max 512 tokens
}
83
+
84
+
85
def get_lextreme_max_train_instances(subset):
    """Look up the in-context training-example cap for ``subset``.

    Raises KeyError for an unknown subset name.
    """
    max_train_instances = TASK_MAX_TRAIN_INSTANCES_MAPPING[subset]
    return max_train_instances
87
+
88
+
89
# Per-subset cap on generated (completion) tokens; the comments record how each
# budget was derived (expected answer length, or NER label-sequence statistics).
TASK_MAX_TOKENS_MAPPING = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: 5,  # one word
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: 5,  # one word
    GERMAN_ARGUMENT_MINING: 5,  # one word
    GREEK_LEGAL_CODE_CHAPTER: 20,  # few non-ASCII words
    GREEK_LEGAL_CODE_SUBJECT: 20,  # few non-ASCII words
    GREEK_LEGAL_CODE_VOLUME: 20,  # few non-ASCII words
    SWISS_JUDGMENT_PREDICTION: 5,  # one word
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: 10,  # two words
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: 10,  # max two words
    COVID19_EMERGENCY_EVENT: 10,  # max two words
    MULTI_EURLEX_LEVEL_1: 10,  # max two words
    MULTI_EURLEX_LEVEL_2: 10,  # max two words
    MULTI_EURLEX_LEVEL_3: 10,  # max two words
    GREEK_LEGAL_NER: 430,  # num NER labels: max 2593, 99% 215, 95% 101 ==> 215 * 2 = 430
    LEGALNERO: 788,  # num NER labels: max 737, 99% 394, 95% 103 ==> 394 * 2 = 788
    LENER_BR: 338,  # num NER labels: max 654, 99% 169, 95% 100 ==> 169 * 2 = 338
    MAPA_COARSE: 274,  # num NER labels: max 367, 99% 137, 95% 83 ==> 137 * 2 = 274
    MAPA_FINE: 274,  # num NER labels: max 367, 99% 137, 95% 83 ==> 137 * 2 = 274
}
109
+
110
+
111
def get_lextreme_max_tokens(subset):
    """Look up the completion-token budget for ``subset``.

    Raises KeyError for an unknown subset name.
    """
    max_tokens = TASK_MAX_TOKENS_MAPPING[subset]
    return max_tokens
113
+
114
+
115
# Per-subset natural-language task instructions shown to the model.
# MLTC subsets end with the same "reply n/a / comma-separated" convention;
# NER subsets enumerate the full BIO tag set for that subset.
INSTRUCTIONS = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: "In this task, you are given the case description "
    "from a decision heard at the State Supreme Court of Alagoas (Brazil). "
    "Predict the judgment of the case "
    "(no: The appeal was denied, "
    "partial: For partially favourable decisions, "
    "yes: For fully favourable decisions)",
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: "In this task, you are given the case description "
    "from a decision heard at the State Supreme Court of Alagoas (Brazil). "
    "Predict the unanimity of the case (unanimity, not-unanimity, not_determined)",
    GERMAN_ARGUMENT_MINING: "In this task, you are given sentences from German court decisions. "
    "Predict the major component of German Urteilsstil "
    "(conclusion: Overall result, "
    "definition: Abstract legal facts and consequences, "
    "subsumption: Determination sentence / Concrete facts, "
    "other: Anything else)",
    GREEK_LEGAL_CODE_CHAPTER: "In this task, you are given a Greek legislative document. "
    "Predict the chapter level category of the "
    "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
    GREEK_LEGAL_CODE_SUBJECT: "In this task, you are given a Greek legislative document. "
    "Predict the subject level category of the "
    "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
    GREEK_LEGAL_CODE_VOLUME: "In this task, you are given a Greek legislative document. "
    "Predict the volume level category of the "
    "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
    SWISS_JUDGMENT_PREDICTION: "In this task, you are given the facts description "
    "from a decision heard at the Swiss Federal Supreme Court. "
    "Predict the judgment of the case (approval: The appeal was approved, or dismissal: The appeal was denied)",
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: "In this task, you are given a sentence "
    "from a Terms of Service (ToS) document. "
    "Predict the unfairness level of the sentence (potentially_unfair, clearly_unfair, clearly_fair, untagged)",
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: "In this task, you are given a sentence "
    "from a Terms of Service (ToS) document. "
    "Predict the clause topics of the sentence out of the following: "
    "0: Arbitration, "
    "1: Unilateral change, "
    "2: Content removal, "
    "3: Jurisdiction, "
    "4: Choice of law, "
    "5: Limitation of liability, "
    "6: Unilateral termination, "
    "7: Contract by using, "
    "8: Privacy included. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    COVID19_EMERGENCY_EVENT: "In this task, you are given a sentence from a European legislative document. "
    "Predict the applicable measurements against COVID-19 out of the following: "
    "0: State of Emergency, "
    "1: Restrictions of fundamental rights and civil liberties, "
    "2: Restrictions of daily liberties, "
    "3: Closures / lockdown, "
    "4: Suspension of international cooperation and commitments, "
    "5: Police mobilization, "
    "6: Army mobilization, "
    "7: Government oversight. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    MULTI_EURLEX_LEVEL_1: "In this task, you are given a document from an EU law. "
    "Predict the level 1 concept in the EUROVOC taxonomy. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    MULTI_EURLEX_LEVEL_2: "In this task, you are given a document from an EU law. "
    "Predict the level 2 concept in the EUROVOC taxonomy. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    MULTI_EURLEX_LEVEL_3: "In this task, you are given a document from an EU law. "
    "Predict the level 3 concept in the EUROVOC taxonomy. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    GREEK_LEGAL_NER: "In this task, you are given a sentence from Greek legislation. "
    "Predict the named entity type for each token out of the following: "
    "O, B-ORG, I-ORG, B-GPE, I-GPE, B-LEG-REFS, I-LEG-REFS, B-PUBLIC-DOCS, I-PUBLIC-DOCS, B-PERSON, I-PERSON, "
    "B-FACILITY, I-FACILITY, B-LOCATION-UNK, I-LOCATION-UNK, B-LOCATION-NAT, I-LOCATION-NAT",
    LEGALNERO: "In this task, you are given a sentence from Romanian legislation. "
    "Predict the named entity type for each token out of the following: "
    "O, B-TIME, I-TIME, B-LEGAL, I-LEGAL, B-ORG, I-ORG, B-LOC, I-LOC, B-PER, I-PER",
    LENER_BR: "In this task, you are given a sentence "
    "from Brazilian legal documents (court decisions and legislation). "
    "Predict the named entity type for each token out of the following: "
    "O, B-ORGANIZACAO, I-ORGANIZACAO, B-PESSOA, I-PESSOA, B-TEMPO, I-TEMPO, B-LOCAL, I-LOCAL, "
    "B-LEGISLACAO, I-LEGISLACAO, B-JURISPRUDENCIA, I-JURISPRUDENCIA",
    MAPA_COARSE: "In this task, you are given a sentence from the EUR-Lex database. "
    "Predict the coarse grained named entity type for each token out of the following: "
    "O, B-ORGANISATION, I-ORGANISATION, B-ADDRESS, I-ADDRESS, B-DATE, I-DATE, "
    "B-PERSON, I-PERSON, B-AMOUNT, I-AMOUNT, B-TIME, I-TIME",
    MAPA_FINE: "In this task, you are given a sentence from the EUR-Lex database. "
    "Predict the fine grained named entity type for each token out of the following: "
    "O, B-BUILDING, I-BUILDING, B-CITY, I-CITY, B-COUNTRY, I-COUNTRY, B-PLACE, I-PLACE, B-TERRITORY, I-TERRITORY, "
    "I-UNIT, B-UNIT, B-VALUE, I-VALUE, B-YEAR, I-YEAR, B-STANDARD ABBREVIATION, I-STANDARD ABBREVIATION, "
    "B-MONTH, I-MONTH, B-DAY, I-DAY, B-AGE, I-AGE, B-ETHNIC CATEGORY, I-ETHNIC CATEGORY, B-FAMILY NAME, I-FAMILY NAME, "
    "B-INITIAL NAME, I-INITIAL NAME, B-MARITAL STATUS, I-MARITAL STATUS, B-PROFESSION, I-PROFESSION, B-ROLE, I-ROLE, "
    "B-NATIONALITY, I-NATIONALITY, B-TITLE, I-TITLE, B-URL, I-URL, B-TYPE, I-TYPE",
}
203
+
204
+
205
def get_lextreme_instructions(subset):
    """Look up the natural-language task instructions for ``subset``.

    Raises KeyError for an unknown subset name.
    """
    instructions = INSTRUCTIONS[subset]
    return instructions
207
+
208
+
209
class LEXTREMEScenario(Scenario):
    """
    The dataset consists of 11 diverse multilingual legal NLU tasks.
    6 tasks have one single configuration and 5 tasks have two or three configurations.
    This leads to a total of 18 tasks (8 single-label text classification tasks,
    5 multi-label text classification tasks and 5 token-classification tasks).
    Find more information on the dataset here: https://huggingface.co/datasets/joelito/lextreme

    We prompt models using the following format (example for german_argument_mining)

        <sentence>
        Urteilsstil:

    Target completion:
        <sentence> (<sentence>:conclusion, subsumption, definition or other)

    Using an example from the training dataset, we have

    ```
    Die Klage ist hinsichtlich der begehrten „Umzugkosten“ und hinsichtlich der begehrten
    „Übernahme der durch den Rechtsstreit gegen das Jobcenter verursachten tatsächlichen Kosten“ insgesamt unzulässig.

    Urteilsstil:
    Target completion:
    conclusion
    ```

    """

    name = "lextreme"
    description = "Multilingual Legal Text Classification and Named Entity Recognition dataset."
    tags = ["single_label_text_classification", "multi_label_text_classification", "named_entity_recognition"]

    # Mapping from HELM splits to HF splits
    splits_mapping = {
        TRAIN_SPLIT: datasets.Split.TRAIN,
        VALID_SPLIT: datasets.Split.VALIDATION,
        TEST_SPLIT: datasets.Split.TEST,
    }

    # Hugging Face Hub dataset identifier passed to `load_dataset`.
    dataset_name = "joelito/lextreme"
    # Cap on the number of wrong references attached to each instance.
    max_number_of_wrong_answers = 30
    delimiter = '" "'  # we choose quotes and whitespace as a delimiter because this is what worked for gpt3

    # BIO tag inventories for the NER subsets; index order must match the
    # integer label ids in the HF dataset, since labels are decoded by index.
    ner_class_mapping = {
        LENER_BR: [
            "O",
            "B-ORGANIZACAO",
            "I-ORGANIZACAO",
            "B-PESSOA",
            "I-PESSOA",
            "B-TEMPO",
            "I-TEMPO",
            "B-LOCAL",
            "I-LOCAL",
            "B-LEGISLACAO",
            "I-LEGISLACAO",
            "B-JURISPRUDENCIA",
            "I-JURISPRUDENCIA",
        ],
        LEGALNERO: [
            "O",
            "B-TIME",
            "I-TIME",
            "B-LEGAL",
            "I-LEGAL",
            "B-ORG",
            "I-ORG",
            "B-LOC",
            "I-LOC",
            "B-PER",
            "I-PER",
        ],
        GREEK_LEGAL_NER: [
            "O",
            "B-ORG",
            "I-ORG",
            "B-GPE",
            "I-GPE",
            "B-LEG-REFS",
            "I-LEG-REFS",
            "B-PUBLIC-DOCS",
            "I-PUBLIC-DOCS",
            "B-PERSON",
            "I-PERSON",
            "B-FACILITY",
            "I-FACILITY",
            "B-LOCATION-UNK",
            "I-LOCATION-UNK",
            "B-LOCATION-NAT",
            "I-LOCATION-NAT",
        ],
        MAPA_COARSE: [
            "O",
            "B-ORGANISATION",
            "I-ORGANISATION",
            "B-ADDRESS",
            "I-ADDRESS",
            "B-DATE",
            "I-DATE",
            "B-PERSON",
            "I-PERSON",
            "B-AMOUNT",
            "I-AMOUNT",
            "B-TIME",
            "I-TIME",
        ],
        MAPA_FINE: [
            "O",
            "B-BUILDING",
            "I-BUILDING",
            "B-CITY",
            "I-CITY",
            "B-COUNTRY",
            "I-COUNTRY",
            "B-PLACE",
            "I-PLACE",
            "B-TERRITORY",
            "I-TERRITORY",
            "I-UNIT",
            "B-UNIT",
            "B-VALUE",
            "I-VALUE",
            "B-YEAR",
            "I-YEAR",
            "B-STANDARD ABBREVIATION",
            "I-STANDARD ABBREVIATION",
            "B-MONTH",
            "I-MONTH",
            "B-DAY",
            "I-DAY",
            "B-AGE",
            "I-AGE",
            "B-ETHNIC CATEGORY",
            "I-ETHNIC CATEGORY",
            "B-FAMILY NAME",
            "I-FAMILY NAME",
            "B-INITIAL NAME",
            "I-INITIAL NAME",
            "B-MARITAL STATUS",
            "I-MARITAL STATUS",
            "B-PROFESSION",
            "I-PROFESSION",
            "B-ROLE",
            "I-ROLE",
            "B-NATIONALITY",
            "I-NATIONALITY",
            "B-TITLE",
            "I-TITLE",
            "B-URL",
            "I-URL",
            "B-TYPE",
            "I-TYPE",
        ],
    }

    def __init__(self, subset: str):
        """Create the scenario for one subset name from TASK_CODE_MAPPING, or "all"."""
        super().__init__()
        assert subset in list(TASK_CODE_MAPPING.keys()) + ["all"], f"Unknown subset: {subset}"
        self.subsets = [subset] if subset != "all" else list(TASK_CODE_MAPPING.keys())
        # Fixed seed so wrong-reference shuffling and multi_eurlex language
        # choice are reproducible across runs.
        self.random: random.Random = random.Random(42)

    def get_instances_for_subset(self, config: str) -> List[Instance]:
        """Build all train/valid/test `Instance`s for one HF configuration ``config``."""
        task_code = TASK_CODE_MAPPING[config]
        # Load dataset
        cache_dir = str(Path(self.output_path) / "data")
        dataset: Any = load_dataset(self.dataset_name, config, cache_dir=cache_dir)

        # Determine the label inventory, which depends on the task type.
        if task_code == TaskType.SLTC:
            # The HF ClassLabel feature carries the label names directly.
            class_label = dataset["train"].features["label"]
            label_classes = class_label.names
        elif task_code == TaskType.MLTC:
            # construct the label classes
            label_classes = set()
            for split in self.splits_mapping.values():
                for example in dataset[split]:
                    label_classes |= set(example["label"])  # add all new labels to the set
            label_classes = sorted(list(map(str, label_classes)))  # convert everything to a string
        elif task_code == TaskType.NER:
            label_classes = self.ner_class_mapping[config]

        def generate_instance(example, split: str):
            # Convert one HF example into a HELM Instance with correct and wrong references.
            # get correct labels
            if task_code == TaskType.SLTC:
                correct_label = class_label.int2str(example["label"])  # get label name for correct label
                correct_labels = correct_label if isinstance(correct_label, list) else [correct_label]
            elif task_code == TaskType.MLTC:
                correct_labels = list(map(str, example["label"]))  # here we don't have any mapping to label names
            elif task_code == TaskType.NER:
                correct_labels = [label_classes[label] for label in example["label"]]

            # construct wrong references
            wrong_references = []
            if task_code in [TaskType.SLTC, TaskType.MLTC]:
                for label_name in label_classes:
                    if label_name not in correct_labels:
                        wrong_reference = Reference(output=Output(label_name), tags=[])  # Wrong output
                        wrong_references.append(wrong_reference)
            elif task_code == TaskType.NER:
                if len(set(correct_labels)) > 1:  # make sure that the correct labels are not only 'O's
                    for label_name in label_classes:
                        if label_name not in correct_labels and label_name != "O":
                            # just replace the non-'O' labels with the new label_name for a fake example
                            new_labels = [label_name if label != "O" else label for label in correct_labels]
                            wrong_reference = Reference(
                                output=Output(construct_ner_sequence(new_labels)), tags=[]
                            )  # Wrong output
                            wrong_references.append(wrong_reference)

            wrong_references = reduce_wrong_reference_count(wrong_references)

            # construct correct references and input
            if task_code in [TaskType.SLTC, TaskType.MLTC]:
                input_text = example["input"]
                if "multi_eurlex" in config:
                    # multi_eurlex stores a stringified dict of language -> document text.
                    input_text = ast.literal_eval(input_text)
                    assert isinstance(input_text, dict)
                    languages = list(input_text.keys())
                    input_text = input_text[self.random.choice(languages)]  # just choose a random language
                correct_references = [
                    Reference(output=Output(correct_label), tags=[CORRECT_TAG]) for correct_label in correct_labels
                ]  # for MLTC we have multiple correct ones
            elif task_code == TaskType.NER:
                input_text = construct_ner_sequence(example["input"])
                correct_references = [
                    Reference(output=Output(construct_ner_sequence(correct_labels)), tags=[CORRECT_TAG])
                ]
            return Instance(input=Input(input_text), references=wrong_references + correct_references, split=split)

        def construct_ner_sequence(ner_list):
            # Join tokens/labels into one quote-delimited string (see `delimiter`).
            return '"' + self.delimiter.join(ner_list) + '"'

        def reduce_wrong_reference_count(wrong_references):
            # Shuffle (seeded) and truncate to at most `max_number_of_wrong_answers`.
            self.random.shuffle(wrong_references)  # shuffle wrong references
            if len(wrong_references) > self.max_number_of_wrong_answers:
                # if there are too many wrong references, only take a subset
                wrong_references = wrong_references[: self.max_number_of_wrong_answers]
            return wrong_references

        def generate_instances(split: str):
            # Materialize all instances for one HELM split.
            split_dataset = dataset[self.splits_mapping[split]]
            return [generate_instance(example, split) for example in split_dataset]

        return generate_instances(TRAIN_SPLIT) + generate_instances(VALID_SPLIT) + generate_instances(TEST_SPLIT)

    def get_instances(self) -> List[Instance]:
        """Concatenate the instances of every configured subset, in subset order."""
        instances = []
        for subset in self.subsets:
            instances.extend(self.get_instances_for_subset(subset))
        return instances
@@ -0,0 +1,86 @@
1
+ import os
2
+ from typing import List
3
+
4
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
5
+ from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
6
+
7
+
8
class MeQSumScenario(Scenario):
    """
    From "On the Summarization of Consumer Health Questions" (Abacha et al.), MeQSum is a corpus of 1,000 summarized
    consumer health questions.

    The following is an example from the dataset:

    Question:
    SUBJECT: inversion of long arm chromasome7 MESSAGE: My son has been diagnosed with inversion of long arm
    chromasome 7 and down syndrome . please could you give me information on the chromasome 7 please because
    our doctors have not yet mentioned it

    Summary:
    Where can I find information on chromosome 7?

    @Inproceedings{MeQSum,
    author = {Asma {Ben Abacha} and Dina Demner-Fushman},
    title = {On the Summarization of Consumer Health Questions},
    booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, ACL 2019,
    Florence, Italy, July 28th - August 2},
    year = {2019},
    abstract = {Question understanding is one of the main challenges in question answering. In real world applications,
    users often submit natural language questions that are longer than needed and include peripheral information that
    increases the complexity of the question, leading to substantially more false positives in answer retrieval. In this
    paper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of
    1,000 summarized consumer health questions. We explore data augmentation methods and evaluate state-of-the-art
    neural abstractive models on this new task. In particular, we show that semantic augmentation from question datasets
    improves the overall performance, and that pointer-generator networks outperform sequence-to-sequence attentional
    models on this task, with a ROUGE-1 score of 44.16%. We also present a detailed error analysis and discuss
    directions for improvement that are specific to question summarization.}}
    """

    SOURCE_URL_TEMPLATE: str = (
        "https://worksheets.codalab.org/rest/bundles/0xd98a53314314445b96b4d703bb2d8c8c/contents/blob/{file_name}"
    )

    name = "me_q_sum"
    description = "MeQSum is a corpus of 1,000 summarized consumer health questions."
    tags = ["summarization", "biomedical"]

    def get_instances(self) -> List[Instance]:
        """Build `Instance`s pairing each consumer health question with its summary."""
        data_path: str = os.path.join(self.output_path, "data")
        ensure_directory_exists(data_path)

        def fetch_lines(file_name: str) -> List[str]:
            # Download the split file from CodaLab (cached on disk) and return its lines.
            target: str = os.path.join(data_path, file_name)
            ensure_file_downloaded(
                source_url=MeQSumScenario.SOURCE_URL_TEMPLATE.format(file_name=file_name),
                target_path=target,
                unpack=False,
            )
            with open(target) as handle:
                return handle.read().splitlines()

        instances: List[Instance] = []
        for split in ALL_SPLITS:
            # The files on disk use "val" for the validation split.
            dataset_split: str = "val" if split == VALID_SPLIT else split

            # Questions live in "<split>.source"; their summaries in "<split>.target".
            questions: List[str] = fetch_lines(f"{dataset_split}.source")
            summaries: List[str] = fetch_lines(f"{dataset_split}.target")

            instances.extend(
                Instance(
                    input=Input(text=question),
                    references=[Reference(output=Output(text=summary), tags=[CORRECT_TAG])],
                    split=split,
                )
                for question, summary in zip(questions, summaries)
            )

        return instances
@@ -0,0 +1,132 @@
1
+ import json
2
+ import os
3
+ from typing import List
4
+
5
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
6
+ from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, Input, Output
7
+
8
+
9
+ class MedDialogScenario(Scenario):
10
+ """
11
+ "The MedDialog dataset (English) contains conversations between doctors and patients.
12
+ It has 0.26 million dialogues. The data is continuously growing and more dialogues will be added.
13
+ The raw dialogues are from healthcaremagic.com and icliniq.com. All copyrights of the data belong
14
+ to healthcaremagic.com and icliniq.com."
15
+
16
+ The following is an example from the healthcaremagic.com subset:
17
+
18
+ Patient: I get cramps on top of my left forearm and hand and it causes my hand and fingers to draw up and it
19
+ hurts. It mainly does this when I bend my arm. I ve been told that I have a slight pinch in a nerve in my neck.
20
+ Could this be a cause? I don t think so. Doctor: Hi there. It may sound difficult to believe it ,but the nerves
21
+ which supply your forearms and hand, start at the level of spinal cord and on their way towards the forearm and
22
+ hand regions which they supply, the course of these nerves pass through difference fascial and muscular planes
23
+ that can make them susceptible to entrapment neuropathies. Its a group of conditions where a nerve gets
24
+ compressed between a muscle and a bone, or between the fibers of a muscle that it pierces or passes through.
25
+ Also, the compression can happen when the nerves are travelling around a blood vessel which can mechanically put
26
+ pressure on them. Usually patients who would be having such a problem present with a dull aching pain over the
27
+ arm and forearm. If it is not too severe and does not cause any neurological deficits then conservative management
28
+ with Pregabalin and Vitamin B complex tablets, activity modifications and physiotherapy can be started which
29
+ will provide relief. Avoid the activities which exaggerate your problem.
30
+
31
+ Could painful forearms be related to pinched nerve in neck?
32
+
33
+
34
+ The following is an example from the icliniq.com subset:
35
+
36
+ Patient: Hello doctor, We are looking for a second opinion on my friend's MRI scan of both the knee joints as he
37
+ is experiencing excruciating pain just above the patella. He has a sudden onset of severe pain on both the knee
38
+ joints about two weeks ago. Previously he had a similar episode about two to three months ago and it subsided
39
+ after resting and painkillers. Doctor: Hi. I viewed the right and left knee MRI images. (attachment removed to
40
+ protect patient identity). Left knee: The MRI, left knee joint shows a complex tear in the posterior horn of the
41
+ medial meniscus area and mild left knee joint effusion. There is some fluid between the semimembranous and medial
42
+ head of gastrocnemius muscles. There is a small area of focal cartilage defect in the upper pole of the patella
43
+ with mild edematous fat. The anterior and posterior cruciate ligaments are normal. The medial and lateral
44
+ collateral ligaments are normal. Right knee: The right knee joint shows mild increased signal intensity in the
45
+ posterior horn of the medial meniscus area and minimal knee joint effusion. There is minimal fluid in the back
46
+ of the lower thigh and not significant. There is a suspicious strain in the left anterior cruciate ligament
47
+ interiorly but largely the attachments are normal. The posterior cruciate ligament is normal. There are subtle
48
+ changes in the upper pole area of the right patella and mild edema. There is mild edema around the bilateral
49
+ distal quadriceps tendons, but there is no obvious tear of the tendons.
50
+
51
+ My friend has excruciating knee pain. Please interpret his MRI report
52
+
53
+
54
+ Paper: https://arxiv.org/abs/2004.03329
55
+ Code: https://github.com/UCSD-AI4H/Medical-Dialogue-System
56
+
57
+ @article{chen2020meddiag,
58
+ title={MedDialog: a large-scale medical dialogue dataset},
59
+ author={Chen, Shu and Ju, Zeqian and Dong, Xiangyu and Fang, Hongchao and Wang, Sicheng and Yang, Yue and Zeng,
60
+ Jiaqi and Zhang, Ruisi and Zhang, Ruoyu and Zhou, Meng and Zhu, Penghui and Xie, Pengtao},
61
+ journal={arXiv preprint arXiv:2004.03329},
62
+ year={2020}
63
+ }
64
+
65
+ We used the data preprocessing from "BioBART: Pretraining and Evaluation o A Biomedical Generative Language Model"
66
+ (Yuan et al.) and generated the following splits:
67
+
68
+ |Dataset | Train | Valid | Test |
69
+ |--------------- |------------|---------|--------|
70
+ |HealthCareMagic | 181,122 | 22,641 | 22,642 |
71
+ |iCliniq | 24,851 | 3,105 | 3,108 |
72
+
73
+ Yuan et al. described, "HealthCareMagic's summaries are more abstractive and are written in a formal style,
74
+ unlike iCliniq's patient-written summaries."
75
+
76
+ Paper: https://arxiv.org/abs/2204.03905
77
+ Code: https://github.com/GanjinZero/BioBART
78
+
79
+ @misc{https://doi.org/10.48550/arxiv.2204.03905,
80
+ doi = {10.48550/ARXIV.2204.03905},
81
+ url = {https://arxiv.org/abs/2204.03905},
82
+ author = {Yuan, Hongyi and Yuan, Zheng and Gan, Ruyi and Zhang, Jiaxing and Xie, Yutao and Yu, Sheng},
83
+ keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences,
84
+ FOS: Computer and information sciences},
85
+ title = {BioBART: Pretraining and Evaluation of A Biomedical Generative Language Model},
86
+ publisher = {arXiv},
87
+ year = {2022},
88
+ copyright = {arXiv.org perpetual, non-exclusive license}
89
+ }
90
+ """
91
+
92
+ name = "med_dialog"
93
+ description = (
94
+ "The MedDialog dataset (English) contains conversations between doctors and patients. "
95
+ "It has 0.26 million dialogues. The data is continuously growing and more dialogues will be added. "
96
+ "The raw dialogues are from healthcaremagic.com and icliniq.com."
97
+ )
98
+ tags = ["dialogue", "biomedical"]
99
+
100
+ def __init__(self, subset: str):
101
+ super().__init__()
102
+ assert subset in ["healthcaremagic", "icliniq"], f"Invalid subset specified for {self.name}: {subset}."
103
+ self.subset: str = subset
104
+
105
+ def get_instances(self) -> List[Instance]:
106
+ data_path: str = os.path.join(self.output_path, self.subset)
107
+ ensure_directory_exists(data_path)
108
+
109
+ instances: List[Instance] = []
110
+
111
+ for split in ALL_SPLITS:
112
+ split_file_name: str = f"{split}.json"
113
+ split_path: str = os.path.join(data_path, split_file_name)
114
+ ensure_file_downloaded(
115
+ source_url="https://worksheets.codalab.org/rest/bundles/0x82f0c47f6d3e4462ae9ef8ea39eebe64/"
116
+ f"contents/blob/{self.subset}/{split_file_name}",
117
+ target_path=split_path,
118
+ unpack=False,
119
+ )
120
+
121
+ with open(split_path, "r") as f:
122
+ examples: List = json.load(f)["data"]
123
+ for example in examples:
124
+ instances.append(
125
+ Instance(
126
+ input=Input(text=example["src"]),
127
+ references=[Reference(Output(text=example["tgt"]), tags=[CORRECT_TAG])],
128
+ split=split,
129
+ )
130
+ )
131
+
132
+ return instances