crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
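
Most of the scenario diffs that follow share one pattern: each scenario module imports TaxonomyInfo and ScenarioMetadata and gains a get_metadata() method. The definitions of those two types are not part of this excerpt; the following is a minimal sketch inferred only from the call sites below (the field names come from the diffs, the types and defaults are assumptions):

    # Hypothetical sketch of the two metadata types, inferred from the call
    # sites in the diffs below. The real definitions live in
    # helm/benchmark/presentation/taxonomy_info.py and
    # helm/benchmark/scenarios/scenario.py and may differ.
    from dataclasses import dataclass
    from typing import Optional


    @dataclass(frozen=True)
    class TaxonomyInfo:
        task: Optional[str] = None  # e.g. "question answering"
        what: Optional[str] = None  # subject matter of the data
        when: Optional[str] = None  # time period the data covers
        who: Optional[str] = None  # who produced the data
        language: Optional[str] = None  # e.g. "English"


    @dataclass(frozen=True)
    class ScenarioMetadata:
        name: str  # machine-readable scenario name, e.g. "gsm"
        display_name: str  # human-readable name for the leaderboard
        main_metric: str  # headline metric, e.g. "exact_match"
        main_split: str  # split the metric is reported on, e.g. "test"
        description: Optional[str] = None  # Markdown description with citation
        short_display_name: Optional[str] = None  # abbreviation for tables
        taxonomy: Optional[TaxonomyInfo] = None
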
helm/benchmark/scenarios/dyck_language_scenario.py
@@ -2,6 +2,7 @@ import numpy as np
 import random
 from typing import List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -234,3 +236,16 @@ class DyckLanguageScenario(Scenario):
             not_allowed=train_inputs,
         )
         return train_instances + test_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="dyck_language",
+            display_name="Dyck",
+            description="Scenario testing hierarchical reasoning through the Dyck formal languages "
+            "[(Suzgun et al., 2019)](https://aclanthology.org/W19-3905/).",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction", what="Dyck formal language", when="n/a", who="n/a", language="synthetic"
+            ),
+            main_metric="exact_match_indicator",
+            main_split="test",
+        )

helm/benchmark/scenarios/ehrshot_scenario.py
@@ -7,6 +7,7 @@ from functools import partial
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Mapping
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 ##################################
@@ -1517,3 +1519,23 @@ class EHRSHOTScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="ehrshot",
+            display_name="EHRSHOT",
+            description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+            "future clinical events using structured EHR code sequences. Each instance "
+            "contains a patient's historical EHR data and a forward-looking clinical "
+            "question about whether a particular diagnosis, lab result, or hospital event "
+            "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict whether a medical event will occur in the future based " "on EHR codes",
+                when="Future prediction",
+                who="Clinician, Insurer",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -2,6 +2,7 @@ from typing import List, Any
 from pathlib import Path
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -56,3 +58,20 @@ class ENEMChallengeScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="enem_challenge",
+            display_name="ENEM Challenge",
+            short_display_name=None,
+            description="ENEM Challenge",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="general academic subjects",
+                when="between 2009 and 2023",
+                who="brazilian ministry of education",
+                language="Portuguese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/entity_data_imputation_scenario.py
@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -160,3 +162,15 @@ class EntityDataImputationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_data_imputation",
+            display_name="Data imputation",
+            description="Scenario from [Mei et al. "
+            "(2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability "
+            "to impute missing entities in a data table.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/entity_matching_scenario.py
@@ -2,6 +2,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Dict, List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset
 
@@ -155,3 +157,15 @@ class EntityMatchingScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_matching",
+            display_name="Entity matching",
+            description="Scenario from Magellan [(Konda et al., "
+            "2016)](https://dl.acm.org/doi/10.14778/3007263.3007314) that tests the ability "
+            "to determine if two entities match.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/fin_qa_scenario.py
@@ -2,6 +2,7 @@ import os
 import json
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     TEST_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 
 
@@ -117,3 +119,21 @@ class FinQAScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="fin_qa",
+            display_name="FinQA",
+            description="The FinQA benchmark for numeric reasoning over financial data, with question "
+            "answering pairs written by financial experts over financial reports [(Chen et "
+            "al., 2021)](https://arxiv.org/abs/2109.00122/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="program_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/financebench_scenario.py
@@ -4,6 +4,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TRAIN_SPLIT,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 
@@ -51,3 +53,22 @@ class FinanceBenchScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financebench",
+            display_name="FinanceBench",
+            description="FinanceBench is a benchmark for open book financial question answering. It "
+            "comprises 10,231 questions about publicly traded companies, with corresponding "
+            "answers and evidence strings [(Islam et al., "
+            "2023)](https://arxiv.org/abs/2311.11944/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="2015 to 2023",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="annotation_financebench_label_correct_answer",
+            main_split="test",
+        )

helm/benchmark/scenarios/financial_phrasebank_scenario.py
@@ -2,6 +2,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -92,3 +94,22 @@ Possible labels:\n1. positive\n2. neutral\n3. negative""" # noqa: E501
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financial_phrasebank",
+            display_name="Financial Phrasebank (Sentiment Classification)",
+            short_display_name=None,
+            description="A sentiment classification benchmark based on the dataset from Good Debt or "
+            "Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., "
+            "2013)](https://arxiv.org/abs/1307.5336).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="phrases from financial news texts and company press releases",
+                when="before 2013",
+                who="annotators with adequate business education background",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/gold_commodity_news_scenario.py
@@ -6,6 +6,7 @@ from typing import List
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.runner import TRAIN_SPLIT
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Scenario,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -122,3 +124,22 @@ class GoldCommodityNewsScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gold_commodity_news",
+            display_name="Gold Commodity News",
+            short_display_name=None,
+            description="A classification benchmark based on a dataset of human-annotated gold "
+            "commodity news headlines ([Sinha & Khandait, "
+            "2019](https://arxiv.org/abs/2009.04202)).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="gold commodity news headlines",
+                when="2000-2019",
+                who="financial journalists",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/gpqa_scenario.py
@@ -2,6 +2,7 @@ import datasets
 import os
 import random
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -78,3 +80,19 @@ class GPQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="GPQA",
+            description=self.description,
+            main_metric="chain_of_thought_correctness",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="complex questions across various disciplines",
+                who="domain experts",
+                when="2024",
+                language="English",
+            ),
+        )
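
Unlike the scenarios above, GPQAScenario fills name and description from self rather than from literals (IFEvalScenario further down does the same). A minimal illustration of the class-attribute pattern this relies on, with hypothetical values (the real ones are defined in gpqa_scenario.py, outside this excerpt):

    # Illustration only: `name=self.name` works because HELM Scenario
    # subclasses declare `name` and `description` as class attributes.
    class Scenario:  # stub standing in for helm.benchmark.scenarios.scenario.Scenario
        name: str = ""
        description: str = ""


    class GPQAScenario(Scenario):
        name = "gpqa"  # hypothetical value, for illustration
        description = "Graduate-level, Google-proof multiple-choice questions"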

helm/benchmark/scenarios/grammar_scenario.py
@@ -1,6 +1,7 @@
 from typing import List
 
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
 
 
@@ -41,3 +42,21 @@ class GrammarScenario(Scenario):
         instances: List[Instance] = list(map(derivation_to_instance, derivations))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="grammar",
+            display_name="Best ChatGPT Prompts",
+            short_display_name="Best ChatGPT Prompts",
+            description="A list of “best ChatGPT prompts to power your workflow” summarized by "
+            "[GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2023",
+                who="Gridfiti Staff",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/gsm_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -65,3 +67,22 @@ class GSM8KScenario(Scenario):
                 ),
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gsm",
+            display_name="GSM8K (Grade School Math)",
+            short_display_name="GSM8K",
+            description="The grade school math word problems dataset (GSM8K) for testing mathematical "
+            "reasoning on grade-school math problems [(Cobbe et al., "
+            "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="numeric answer question answering",
+                what="grade school math word problems",
+                when="2021",
+                who="contractors on Upwork and Surge AI",
+                language="English",
+            ),
+            main_metric="final_number_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py
@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchGCGTransferScenario(Scenario):
@@ -48,3 +49,13 @@ class HarmBenchGCGTransferScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench_gcg_transfer",
+            display_name="HarmBenchGCGTransfer",
+            description="HarmBenchGCGTransfer",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/harm_bench_scenario.py
@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchScenario(Scenario):
@@ -57,3 +58,13 @@ class HarmBenchScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, sub_split=tag, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench",
+            display_name="HarmBench",
+            description="HarmBench",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/headqa_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Optional
 
 from datasets import DatasetDict, load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -134,3 +136,23 @@ class HeadQAScenario(Scenario):
             )
 
        return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="head_qa",
+            display_name="HeadQA",
+            description="HeadQA is a benchmark consisting of biomedical multiple-choice questions "
+            "intended to evaluate a model's medical knowledge and reasoning. Each instance "
+            "presents a clinical or scientific question with four answer options, requiring "
+            "the model to select the most appropriate answer [(Vilares et al., "
+            "2019)](https://arxiv.org/abs/1906.04701).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
@@ -2,11 +2,13 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -35,3 +37,14 @@ class HelpdeskCallSummarizationScenario(Scenario):
             instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="helpdesk_call_summarization",
+            display_name="Helpdesk Call summarization",
+            short_display_name=None,
+            description="Helpdesk Call summarization",
+            taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"),
+            main_metric="unknown",
+            main_split="test",
+        )

helm/benchmark/scenarios/ice_scenario.py
@@ -4,9 +4,10 @@ from typing import List, Union
 from enum import Enum
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 try:
     # pd.read_excel() uses xlrd
@@ -467,3 +468,22 @@ class ICEScenario(Scenario):
             instances.append(Instance(Input(text=t), references=[], split=TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ice",
+            display_name="ICE (International Corpus of English)",
+            short_display_name="ICE",
+            description="The International Corpus of English (ICE) drawn from English speakers from "
+            "various places in the world, initiated by [Greenbaum "
+            "(1991)](https://www.cambridge.org/core/journals/english-today/article/abs/ice-the-international-corpus-of-english/47808205394C538393C3FD8E62E5E701).",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English varieties from different nations",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/ifeval_scenario.py
@@ -1,8 +1,10 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.schema import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
+    ScenarioMetadata,
     Instance,
     Input,
     TEST_SPLIT,
@@ -51,3 +53,19 @@ class IFEvalScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="IFEval",
+            description=self.description,
+            main_metric="ifeval_strict_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="verifiable general domain instruction following",
+                who="human annotators",
+                when="2023",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/imdb_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
@@ -143,3 +145,16 @@ class IMDBScenario(Scenario):
         for split in [TRAIN_SPLIT, VALID_SPLIT]:
             instances.extend(self.get_split_instances(target_path, split, contrast_map))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="imdb",
+            display_name="IMDB",
+            description="The IMDB benchmark for sentiment analysis in movie review [(Maas et al., "
+            "2011)](https://aclanthology.org/P11-1015/).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="movie reviews", when="?", who="?", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="valid",
+        )
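
Taken together, these hunks give every scenario a uniform metadata hook, which lets downstream code (for example the expanded summarize.py listed above) describe any scenario without per-scenario branching. A sketch of such a consumer, assuming only the get_metadata() interface exercised in the diffs above:

    # Hypothetical consumer of the new interface; only get_metadata() and the
    # fields used in the diffs above are assumed.
    def describe(scenario) -> str:
        metadata = scenario.get_metadata()
        return f"{metadata.display_name}: report {metadata.main_metric} on the {metadata.main_split} split"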