crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_specs/arabic_run_specs.py +6 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/commonsense_scenario.py +7 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/gsm_scenario.py +9 -3
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -7
- helm/benchmark/scenarios/math_scenario.py +11 -4
- helm/benchmark/scenarios/med_qa_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mmlu_scenario.py +8 -2
- helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
- helm/benchmark/static/schema_long_context.yaml +12 -31
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +5 -1
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/together_client.py +4 -0
- helm/clients/vertexai_client.py +4 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +225 -0
- helm/config/model_metadata.yaml +232 -7
- helm/config/tokenizer_configs.yaml +74 -4
- helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py

@@ -1,6 +1,7 @@
 from typing import List
-import
+import os
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -13,8 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
-from .ultra_suite_classification_scenario import find_audio_json_pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteDisorderBreakdownScenario(Scenario):
@@ -38,46 +38,38 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'disorder_class' field
         """
-
-
-
-
-
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
-
-        pairs = find_audio_json_pairs(data_path)
-        print(f"Num pairs: {len(pairs)}")
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
             # Load the annotation
-
-
+            label = row["disorder_type"]
+            transcription = row["transcription"]
 
-
-
-
-
-            prompt = annotation["transcription"]
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
             # Create references for each option
             references: List[Reference] = []
-
-
+            options = ["typically_developing", "articulation", "phonological"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=
-                MediaObject(content_type="text/plain", text=self.get_instruction(
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]
 
             input = Input(multimedia_content=MultimediaObject(content))

helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py

@@ -1,7 +1,7 @@
-from typing import List, Tuple
+from typing import List
 import os
-import json
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteDisorderSymptomsScenario(Scenario):
@@ -70,45 +39,37 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-
-
-
-
-
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
-
-
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
+            label = row["disorder_symptom"]
+            transcription = row["transcription"]
 
-
-
-
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
-            # Get the correct answer and convert to label
-            if "disorder_symptom" not in annotation or "transcription" not in annotation:
-                continue
-            label = annotation["disorder_symptom"]
-            prompt = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
-
-
+            options = ["substitution", "omission", "addition", "typically_developing", "stuttering"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=
-                MediaObject(content_type="text/plain", text=self.get_instruction(
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]
 
             input = Input(multimedia_content=MultimediaObject(content))

helm/benchmark/scenarios/banking77_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -54,3 +56,22 @@ class Banking77Scenario(Scenario):
             instance = Instance(input=input, references=references, split=split_name)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="banking77",
+            display_name="BANKING77",
+            short_display_name="BANKING77",
+            description="BANKING77 is a benchmark for intent classification of customer service queries "
+            "in the banking domain [(Casanueva et al., "
+            "2020)](https://aclanthology.org/2020.nlp4convai-1.5/).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="customer service queries in the banking domain",
+                when="During or before 2020",
+                who="banking customers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/bbq_scenario.py

@@ -249,6 +249,6 @@
             "question answering in ambiguous and unambigous context [(Parrish et al., "
             "2022)](https://aclanthology.org/2022.findings-acl.165/).",
             taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
-            main_metric="
+            main_metric="bbq_accuracy",
             main_split="test",
         )

helm/benchmark/scenarios/bird_sql_scenario.py

@@ -4,6 +4,7 @@ from typing import Dict, List
 
 from filelock import FileLock
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
@@ -18,6 +19,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -92,3 +94,19 @@ INSERT_YOUR_SQL_QUERY_HERE
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="bird_sql",
+            display_name="BIRD-SQL (Dev)",
+            description="BIRD-SQL (Dev)",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="databases from various domains",
+                when="?",
+                who="expert data scientists",
+                language="English",
+            ),
+            main_metric="execution_accuracy",
+            main_split="valid",
+        )

helm/benchmark/scenarios/commonsense_scenario.py

@@ -134,7 +134,13 @@ class OpenBookQA(Scenario):
             display_name="OpenbookQA",
             description="The OpenbookQA benchmark for commonsense-intensive open book question "
             "answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).",
-            taxonomy=TaxonomyInfo(
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="elementary science",
+                when="2018",
+                who="Amazon Mechnical Turk workers",
+                language="English",
+            ),
             main_metric="exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/czech_bank_qa_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -128,3 +130,19 @@ CREATE TABLE "trans" (
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="czech_bank_qa",
+            display_name="CzechBankQA",
+            description="The CzechBankQA",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="queries from financial experts",
+                when="1999",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="error_rate",
+            main_split="test",
+        )

helm/benchmark/scenarios/fin_qa_scenario.py

@@ -2,6 +2,7 @@ import os
 import json
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     TEST_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 
 
@@ -117,3 +119,21 @@ class FinQAScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="fin_qa",
+            display_name="FinQA",
+            description="The FinQA benchmark for numeric reasoning over financial data, with question "
+            "answering pairs written by financial experts over financial reports [(Chen et "
+            "al., 2021)](https://arxiv.org/abs/2109.00122/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="program_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/financebench_scenario.py

@@ -4,6 +4,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TRAIN_SPLIT,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 
@@ -51,3 +53,22 @@ class FinanceBenchScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financebench",
+            display_name="FinanceBench",
+            description="FinanceBench is a benchmark for open book financial question answering. It "
+            "comprises 10,231 questions about publicly traded companies, with corresponding "
+            "answers and evidence strings [(Islam et al., "
+            "2023)](https://arxiv.org/abs/2311.11944/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="2015 to 2023",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="annotation_financebench_label_correct_answer",
+            main_split="test",
+        )

helm/benchmark/scenarios/gsm_scenario.py

@@ -71,12 +71,18 @@ class GSM8KScenario(Scenario):
     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
             name="gsm",
-            display_name="GSM8K (Grade
+            display_name="GSM8K (Grade School Math)",
             short_display_name="GSM8K",
             description="The grade school math word problems dataset (GSM8K) for testing mathematical "
             "reasoning on grade-school math problems [(Cobbe et al., "
             "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
-            taxonomy=TaxonomyInfo(
-
+            taxonomy=TaxonomyInfo(
+                task="numeric answer question answering",
+                what="grade school math word problems",
+                when="2021",
+                who="contractors on Upwork and Surge AI",
+                language="English",
+            ),
+            main_metric="final_number_exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py

@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchGCGTransferScenario(Scenario):
@@ -48,3 +49,13 @@ class HarmBenchGCGTransferScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench_gcg_transfer",
+            display_name="HarmBenchGCGTransfer",
+            description="HarmBenchGCGTransfer",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/harm_bench_scenario.py

@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchScenario(Scenario):
@@ -57,3 +58,13 @@ class HarmBenchScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, sub_split=tag, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench",
+            display_name="HarmBench",
+            description="HarmBench",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py

@@ -4,6 +4,7 @@ from typing import List
 
 from datasets import load_dataset, Features, Value, Sequence, Dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -88,3 +90,22 @@ class InfiniteBenchEnMCScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_mc",
+            display_name="∞Bench En.MC",
+            description="∞Bench En.MC is a multiple-choice question answering task that requires "
+            "locating and processing information within a novel, performing reasoning "
+            "through aggregation or filtering to derive answers. ([Zhang et al., "
+            "2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py

@@ -2,6 +2,7 @@ import os
 import re
 from typing import List
 from datasets import load_dataset, Features, Value, Sequence, Dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -77,3 +79,20 @@ class InfiniteBenchEnSumScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_sum",
+            display_name="∞Bench En.Sum",
+            description="∞Bench En.Sum is a summarization task that requires generating a concise "
+            "summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multi-hop question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )

helm/benchmark/scenarios/legalbench_scenario.py

@@ -149,15 +149,14 @@ class LegalBenchScenario(Scenario):
 
     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
-            name=
+            name=self.name,
             display_name="LegalBench",
-            description="LegalBench is a large collaboratively constructed benchmark of legal "
-            "
-            "2023)[https://arxiv.org/abs/2308.11462] for more details.",
+            description="LegalBench is a large collaboratively constructed benchmark of legal reasoning "
+            "tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).",
             taxonomy=TaxonomyInfo(
-                task="
-                what="
-                when="
+                task="multiple-choice question answering",
+                what="public legal and admininstrative documents, manually " "constructed questions",
+                when="before 2023",
                 who="lawyers",
                 language="English",
             ),

helm/benchmark/scenarios/math_scenario.py

@@ -454,14 +454,21 @@ class MATHScenario(Scenario):
         return instances
 
     def get_metadata(self) -> ScenarioMetadata:
+        taxonomy = TaxonomyInfo(
+            task="numeric answer question answering",
+            what="math competitions (AMC, AIME, etc.)",
+            when="before 2021",
+            who="problem setters",
+            language="synthetic",
+        )
         if self.use_chain_of_thought:
             return ScenarioMetadata(
                 name="math_chain_of_thought",
-                display_name="MATH
+                display_name="MATH",
                 description="The MATH benchmark for measuring mathematical problem solving on competition "
                 "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
-                "2021)](https://
-                taxonomy=
+                "2021)](https://arxiv.org/pdf/2103.03874.pdf).",
+                taxonomy=taxonomy,
                 main_metric="math_equiv_chain_of_thought",
                 main_split="test",
             )
@@ -472,7 +479,7 @@ class MATHScenario(Scenario):
             description="The MATH benchmark for measuring mathematical problem solving on competition "
             "math problems [(Hendrycks et al., "
            "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
-            taxonomy=
+            taxonomy=taxonomy,
             main_metric="math_equiv",
             main_split="test",
         )

helm/benchmark/scenarios/med_qa_scenario.py

@@ -113,7 +113,13 @@ class MedQAScenario(Scenario):
             description="MedQA is an open domain question answering dataset composed of questions from "
             "professional medical board exams ([Jin et al. "
             "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
-            taxonomy=TaxonomyInfo(
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="US medical licensing exams",
+                when="before 2020",
+                who="problem setters",
+                language="English",
+            ),
             main_metric="quasi_exact_match",
             main_split="test",
         )

helm/benchmark/scenarios/medi_qa_scenario.py

@@ -51,7 +51,7 @@ class MediQAScenario(Scenario):
 
     name = "medi_qa"
     description = (
-        "MEDIQA is a benchmark designed to evaluate a model's ability to
+        "MEDIQA is a benchmark designed to evaluate a model's ability to generate"
         "medically accurate answers to patient-generated questions. Each instance includes a"
         "consumer health question, a set of candidate answers (used in ranking tasks), relevance"
         "annotations, and optionally, additional context. The benchmark focuses on supporting"
@@ -124,7 +124,7 @@ class MediQAScenario(Scenario):
             "health communication.",
             taxonomy=TaxonomyInfo(
                 task="Text generation",
-                what="
+                what="Generate medically accurate answers to patient-generated questions.",
                 when="Any",
                 who="Clinician, Medical Student",
                 language="English",