crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
--- a/helm/benchmark/scenarios/code_scenario.py
+++ b/helm/benchmark/scenarios/code_scenario.py
@@ -55,6 +55,7 @@ import os
 import sys
 from typing import List, Dict, Iterable, Optional, cast
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.code_scenario_helper import run as run_reindent
@@ -69,6 +70,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -331,3 +333,29 @@ class CodeScenario(Scenario):
             raise ValueError(f"Unknown dataset: {self.dataset}")
 
         return cast(List[Instance], instances)
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset == "humaneval":
+            return ScenarioMetadata(
+                name="code_humaneval",
+                display_name="HumanEval (Code)",
+                description="The HumanEval benchmark for measuring functional correctness for synthesizing "
+                "programs from docstrings [(Chen et al., "
+                "2021)](https://arxiv.org/pdf/2107.03374.pdf).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="pass",
+                main_split="test",
+            )
+        elif self.dataset == "apps":
+            return ScenarioMetadata(
+                name="code_apps",
+                display_name="APPS (Code)",
+                description="The APPS benchmark for measuring competence on code challenges [(Hendrycks et "
+                "al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/c24cd76e1ce41366a4bbe8a49b02a028-Abstract-round2.html).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="test_avg",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset}")
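
The recurring change across the scenario files in this release is this new get_metadata hook: each scenario returns a ScenarioMetadata record carrying a TaxonomyInfo that describes the task. As a reading aid, here is a minimal sketch of the pattern; the field names (name, display_name, description, taxonomy, main_metric, main_split) and the TaxonomyInfo axes (task, what, when, who, language) are taken from the hunks in this diff, while the toy scenario class and all of its values are hypothetical and not part of crfm-helm.

# Minimal sketch of the new metadata hook, inferred from the hunks in this
# diff. `MyToyScenario` and its values are hypothetical; only the import paths
# and field names come from the release itself.
from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class MyToyScenario(Scenario):
    name = "my_toy_scenario"
    description = "Hypothetical scenario illustrating the get_metadata pattern."
    tags = ["toy"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # A real scenario would download data and build Instances here.
        return []

    def get_metadata(self) -> ScenarioMetadata:
        # Return one metadata record describing this scenario.
        return ScenarioMetadata(
            name="my_toy_scenario",
            display_name="My Toy Scenario",
            description="Illustration only.",
            taxonomy=TaxonomyInfo(
                task="question answering", what="?", when="?", who="?", language="English"
            ),
            main_metric="exact_match",
            main_split="test",
        )
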
--- a/helm/benchmark/scenarios/commonsense_scenario.py
+++ b/helm/benchmark/scenarios/commonsense_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -72,6 +74,19 @@ class HellaSwagScenario(Scenario):
         assert len(answers) == 4
         return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="hellaswag",
+            display_name="HellaSwag",
+            description="The HellaSwag benchmark for commonsense reasoning in question answering "
+            "[(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering", what="commonsense reasoning", when="?", who="?", language="English"
+            ),
+            main_metric="exact_match",
+            main_split="valid",
+        )
+
 
 class OpenBookQA(Scenario):
     name = "openbookqa"
@@ -113,6 +128,23 @@ class OpenBookQA(Scenario):
         assert item["question"]["choices"][correct_choice]["label"] == item["answerKey"]
         return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="openbookqa",
+            display_name="OpenbookQA",
+            description="The OpenbookQA benchmark for commonsense-intensive open book question "
+            "answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="elementary science",
+                when="2018",
+                who="Amazon Mechanical Turk workers",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 class CommonSenseQAScenario(Scenario):
     name = "commonsenseqa"
--- /dev/null
+++ b/helm/benchmark/scenarios/compositional_instructions.yaml
@@ -0,0 +1,70 @@
+rules:
+  - category: Root
+    expansions:
+      - text: ${Task} ${Style}
+
+  - category: Task
+    expansions:
+      - text: Explain ${HowTo}
+      - text: Explain ${Topic}
+      - text: Tell me a joke about ${Topic}
+      - text: Tell me a joke about ${TopicPair}
+      - text: What are the similarities between ${TopicPair}
+      - text: What are the differences between ${TopicPair}
+      - text: Tell me 5 surprising facts about ${Topic}
+      - text: Persuade me to learn about ${Topic}
+
+  - category: HowTo
+    expansions:
+      - text: how airplanes fly
+      - text: how maglev trains work
+      - text: how to grow tomatoes in the wintertime
+
+  - category: Topic
+    expansions:
+      - text: the quicksort algorithm
+      - text: stochastic gradient descent
+      - text: the Great Vowel Shift
+      - text: northern lights
+      - text: the Romantic period
+      - text: the Civil Rights movement
+      - text: the Pacific Northwest
+      - text: El Niño
+
+  - category: TopicPair
+    expansions:
+      - text: north and south
+      - text: gradient descent and gradient ascent
+      - text: vowels and consonants
+      - text: C and C++
+      - text: Google and Microsoft
+      - text: US and Canada
+      - text: cats and dogs
+      - text: the Baroque period and the Romantic period
+
+  - category: Style
+    expansions:
+      - text: as a paragraph.
+      - text: as a haiku.
+      - text: as a limerick.
+      - text: in the style of a Shakespeare sonnet.
+      - text: in the style of a court case.
+      - text: in the style of Snoop Dogg.
+      - text: so that a ${Age}-year old can understand it.
+      - text: in ${Language}.
+      - text: in 3 bullet points.
+      - text: in 8 bullet points.
+
+  - category: Age
+    expansions:
+      - text: "5"
+      - text: "9"
+      - text: "13"
+
+  - category: Language
+    expansions:
+      - text: Italian
+      - text: Greek
+      - text: Indian
+      - text: Chinese
+      - text: Thai
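
The new compositional_instructions.yaml above is a small prompt grammar: Root expands to a Task followed by a Style, and each ${Category} placeholder is substituted with one of that category's expansions, recursively. Below is a minimal sketch of such an expansion, assuming a simplified recursive substitution; RULES mirrors a subset of the YAML, and the expand helper is hypothetical, not the grammar code crfm-helm ships.

# Minimal sketch of expanding a compositional grammar like the YAML above.
# RULES is a hand-copied subset of the YAML; `expand` is an illustrative
# helper, not HELM's actual grammar implementation.
import random
import re

RULES = {
    "Root": ["${Task} ${Style}"],
    "Task": ["Explain ${Topic}", "Tell me a joke about ${Topic}"],
    "Topic": ["the quicksort algorithm", "northern lights", "El Niño"],
    "Style": ["as a haiku.", "in 3 bullet points."],
}


def expand(text: str) -> str:
    """Replace each ${Category} placeholder with a random expansion, recursively."""
    match = re.search(r"\$\{(\w+)\}", text)
    if match is None:
        return text  # no placeholders left
    replacement = random.choice(RULES[match.group(1)])
    return expand(text[: match.start()] + replacement + text[match.end():])


print(expand("${Root}"))  # e.g. "Explain northern lights as a haiku."

Each call yields one fully grounded instruction, for example "Tell me a joke about El Niño in 3 bullet points."
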
--- a/helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py
+++ b/helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -95,3 +97,22 @@ class ConvFinQACalcScenario(Scenario):
         for raw_instance in raw_instances:
             instances.append(self.convert_to_instance(raw_instance, split))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="conv_fin_qa_calc",
+            display_name="ConvFinQACalc",
+            short_display_name=None,
+            description="A mathematical calculation benchmark based on ConvFinQA: Exploring the Chain "
+            "of Numerical Reasoning in Conversational Finance Question Answering [(Chen et "
+            "al., 2022)](https://arxiv.org/pdf/2210.03849.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="float_equiv",
+            main_split="valid",
+        )
--- a/helm/benchmark/scenarios/copyright_scenario.py
+++ b/helm/benchmark/scenarios/copyright_scenario.py
@@ -3,8 +3,18 @@ import os
 import tqdm
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Output,
+    ScenarioMetadata,
+)
 
 datatag2hash_text = {
     # The "average" book.
@@ -81,3 +91,27 @@ class CopyrightScenario(Scenario):
                 ),
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.datatag in datatag2hash_code:
+            return ScenarioMetadata(
+                name="copyright_code",
+                display_name="Copyright (code)",
+                description="Scenario introduced in this work to measure copyright and memorization "
+                "behavior for code, based off of [Carlini et al. "
+                "(2021)](https://www.usenix.org/biblio-11958).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="unknown",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="copyright_text",
+                display_name="Copyright (text)",
+                description="Scenario introduced in this work to measure copyright and memorization "
+                "behavior for books, based off of [Carlini et al. "
+                "(2021)](https://www.usenix.org/biblio-11958).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="unknown",
+                main_split="test",
+            )
--- a/helm/benchmark/scenarios/cti_to_mitre_scenario.py
+++ b/helm/benchmark/scenarios/cti_to_mitre_scenario.py
@@ -6,6 +6,7 @@ from typing import Any, List, Dict
 import pandas as pd
 from pandas import DataFrame
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -238,3 +240,22 @@ class CtiToMitreScenario(Scenario):
         # return all instances
         all_instances = instances_train + instances_test
         return all_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cti_to_mitre",
+            display_name="CTI-to-MITRE Cyber Threat Intelligence",
+            short_display_name=None,
+            description="A classification benchmark based on Automatic Mapping of Unstructured Cyber "
+            "Threat Intelligence - An Experimental Study [(Orbinato et al., "
+            "2022)](https://arxiv.org/pdf/2208.12144.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="Descriptions of malicious techniques",
+                when="Before 2022",
+                who="Security professionals",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/czech_bank_qa_scenario.py
+++ b/helm/benchmark/scenarios/czech_bank_qa_scenario.py
@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -128,3 +130,19 @@ CREATE TABLE "trans" (
         instance = Instance(input=input, references=references, split=TEST_SPLIT)
         instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="czech_bank_qa",
+            display_name="CzechBankQA",
+            description="The CzechBankQA",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="queries from financial experts",
+                when="1999",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="error_rate",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py
@@ -1,9 +1,19 @@
 import os
 import json
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.general import ensure_directory_exists
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    CORRECT_TAG,
+    Output,
+    VALID_SPLIT,
+    ScenarioMetadata,
+)
 
 TASKS = {
     "counterfactual": ["snli_premise", "snli_hypothesis"]
@@ -167,3 +177,14 @@ class DecodingTrustAdvDemoScenario(Scenario):
                 if self.demo_name in ["cf", "zero"]:
                     break
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_demonstration",
+            display_name="DecodingTrust - Adversarial Demonstrations",
+            short_display_name="AdvDemo",
+            description="Robustness analysis of LM generations when facing adversarial demonstrations",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py
@@ -1,8 +1,18 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    VALID_SPLIT,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 TASK_DESCRIPTIONS = {
     "sst2": "For the given sentence, label the sentiment of the sentence as positive or negative. The answer should be "
@@ -119,3 +129,15 @@ class DecodingTrustAdvRobustnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_robustness",
+            display_name="DecodingTrust - AdvGLUE++",
+            short_display_name="AdvGLUE++",
+            description="Adversarial perturbations of the GLUE dataset generated against open-source "
+            "LLMs including Alpaca, Vicuna, and Stable-Vicuna",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/decodingtrust_fairness_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_fairness_scenario.py
@@ -1,8 +1,18 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    VALID_SPLIT,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 TASK_DESCRIPTIONS = {
     "predict_annual_income": "For each snippet of text, predict whether the person described has an annual income "
@@ -75,3 +85,14 @@ class DecodingTrustFairnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_fairness",
+            display_name="DecodingTrust - Fairness",
+            short_display_name="Fairness",
+            description="Fairness analysis of LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py
@@ -4,8 +4,17 @@ import random
 import numpy as np
 import pandas as pd
 from typing import List, Dict, Tuple, Union
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 OPTION_DICT: Dict[str, Union[List[int], List[str]]] = {
     "ethics_commonsense_short": ["not wrong", "wrong"],
@@ -322,3 +331,14 @@ class DecodingTrustMachineEthicsScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_machine_ethics",
+            display_name="DecodingTrust - Ethics",
+            short_display_name="Ethics",
+            description="Evaluation of the understanding of ethical behaviors of LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py
@@ -1,6 +1,7 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 
 TASK = {
@@ -202,3 +204,14 @@ class DecodingTrustOODRobustnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_ood_robustness",
+            display_name="DecodingTrust - OoD Robustness",
+            short_display_name="OoD",
+            description="Robustness analysis of LM generations when facing out-of-distribution data",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/decodingtrust_privacy_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_privacy_scenario.py
@@ -4,8 +4,9 @@ import string
 import random
 from tqdm import tqdm
 from typing import List, Dict, Optional
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT, ScenarioMetadata
 
 option_keys = ["pii", "privacy_understanding", "enron_email_extraction"]
 
@@ -217,6 +218,17 @@ class DecodingTrustPrivacyScenario(Scenario):
 
         return instances
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_privacy",
+            display_name="DecodingTrust - Privacy",
+            short_display_name="Privacy",
+            description="Evaluation of the privacy understanding and privacy preserving properties of LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
+
 
 def get_local_domain(email):
     return email.split("@")
--- a/helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py
@@ -1,7 +1,8 @@
 import json
 import os
 from typing import List, Dict
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import Reference, Output
 
@@ -66,3 +67,14 @@ class DecodingTrustStereotypeBiasScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_stereotype_bias",
+            display_name="DecodingTrust - Stereotype Bias",
+            short_display_name="Stereotype",
+            description="Manually crafted stereotype user prompts from DecodingTrust",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py
+++ b/helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py
@@ -3,8 +3,9 @@ import os
 import random
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 DATA_REPO_HASH = "38972f6ccbf376a8d0660babafb4d2b3b9cca3f4"
@@ -76,3 +77,14 @@ class DecodingTrustToxicityPromptsScenario(Scenario):
         random.shuffle(instances)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_toxicity_prompts",
+            display_name="DecodingTrust - Toxicity",
+            short_display_name="Toxicity",
+            description="Evaluation of the toxicity of text generated by LLMs",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/dischargeme_scenario.py
+++ b/helm/benchmark/scenarios/dischargeme_scenario.py
@@ -1,4 +1,5 @@
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 import pandas as pd
 
@@ -170,3 +172,25 @@ class DischargeMeScenario(Scenario):
         lines = file.readlines()
         lines = [line.strip() for line in lines]
         return lines
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="dischargeme",
+            display_name="DischargeMe",
+            short_display_name="DischargeMe",
+            description="DischargeMe is a benchmark designed to evaluate clinical text generation. It "
+            "pairs discharge summaries and radiology reports from MIMIC-IV with generation "
+            "tasks such as writing discharge instructions or summarizing the brief hospital "
+            "course. The benchmark assesses a model's ability to generate patient-facing "
+            "documentation that is complete, empathetic, and clinically accurate [(Xu, "
+            "2024)](https://physionet.org/content/discharge-me/1.3/).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate discharge instructions from hospital notes",
+                when="Upon hospital discharge",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="dischargeme_accuracy",
+            main_split="test",
+        )
--- a/helm/benchmark/scenarios/disinformation_scenario.py
+++ b/helm/benchmark/scenarios/disinformation_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 REITERATION_DATA_URL = "https://drive.google.com/uc?export=download&id=1uVJbsgPCHFAvH43I6SVvU3Ayo8dh-y_N"
@@ -175,3 +177,23 @@ class DisinformationScenario(Scenario):
             instances = self.create_wedging_instances(data)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.capability == "reiteration":
+            name = "disinformation_reiteration"
+            display_name = "Disinformation (reiteration)"
+        elif self.capability == "wedging":
+            name = "disinformation_wedging"
+            display_name = "Disinformation (wedging)"
+        else:
+            raise Exception(f"Unknown capability {self.capability}")
+        return ScenarioMetadata(
+            name=name,
+            display_name=display_name,
+            description="Scenario from [Buchanan et al. "
+            "(2021)](https://cset.georgetown.edu/publication/truth-lies-and-automation/) "
+            "that tests the ability to generate divisive and wedging content.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="valid",
+        )