crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -0,0 +1,109 @@
+ from typing import List, Tuple
+ import os
+ import json
+
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded
+
+
+ def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+     """
+     Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+     Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+     Args:
+         directory: Path to the directory containing the files
+
+     Returns:
+         List of tuples where each tuple contains (mp3_path, json_path)
+     """
+     pairs = []
+
+     # Walk through all directories and subdirectories
+     for root, _, files in os.walk(directory):
+         # Get all MP3 files in current directory
+         mp3_files = [f for f in files if f.endswith(".mp3")]
+
+         for mp3_file in mp3_files:
+             base_name = os.path.splitext(mp3_file)[0]
+             json_file = f"{base_name}.json"
+
+             # Check if corresponding JSON file exists in the same directory
+             if json_file in files:
+                 mp3_path = os.path.join(root, mp3_file)
+                 json_path = os.path.join(root, json_file)
+                 pairs.append((mp3_path, json_path))
+
+     return pairs
+
+
+ class UltraSuiteDisorderSymptomsScenario(Scenario):
+     """
+     A scenario identifying features of speech disorders within the provided audio.
+     The audio files contain speech from children, potentially with an adult present.
+     """
+
+     name = "speech_disorder"
+     description = "A scenario for evaluating speech disorders in children"
+     tags = ["audio", "classification", "speech_disorder"]
+     HF_MAPPING_URL = "https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels"
+
+     def get_instruction(self, words: str) -> str:
+         prompt = f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording and recognize any abnormal features in the child's speech. 2. These features can be on of the following: A - 'substitution', B - 'omission', C - 'addition', D - 'typically_developing', or E - 'stuttering'. Here, 'substitution' is when the child substitutes one word/phrase/syllable for another. 'omission' is when the child omits one word/phrase/syllable. 'addition' is when the child adds one word/phrase/syllable. 'typically_developing' is when the child's speech is typical of a child of their age. 'stuttering' is when the child stutters, has difficulty speaking, repeats sounds/words or prolongs sounds/words. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text.""" # noqa: E501
+
+         return prompt
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         """
+         Create instances from the audio files and their corresponding JSON annotations.
+         The data directory should contain:
+         - Audio files (e.g., .mp3)
+         - A JSON file with annotations containing 'answer' field
+         """
+         print(f"Downloading dataset from {UltraSuiteDisorderSymptomsScenario.HF_MAPPING_URL} to {output_path}")
+         ensure_file_downloaded(source_url=UltraSuiteDisorderSymptomsScenario.HF_MAPPING_URL, target_path=output_path)
+
+         instances: List[Instance] = []
+         split: str = TEST_SPLIT
+
+         # Find all pairs of audio and JSON files
+         pairs = find_audio_json_pairs(output_path)
+
+         for audio_path, json_path in tqdm(pairs):
+
+             # Load the annotation
+             with open(json_path, "r") as f:
+                 annotation = json.load(f)
+
+             # Get the correct answer and convert to label
+             if "disorder_symptom" not in annotation or "transcription" not in annotation:
+                 continue
+             label = annotation["disorder_symptom"]
+             prompt = annotation["transcription"]
+             # Create references for each option
+             references: List[Reference] = []
+             for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                 references.append(reference)
+
+             # Create the input with audio and instruction
+             content = [
+                 MediaObject(content_type="audio/mpeg", location=audio_path),
+                 MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+             ]
+
+             input = Input(multimedia_content=MultimediaObject(content))
+             instances.append(Instance(input=input, references=references, split=split))
+
+         return instances
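
The hunk above adds the new module helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py (file 80 in the list). As a rough illustration of how its find_audio_json_pairs helper pairs audio clips with their annotations, here is a minimal sketch; the directory layout and annotation contents are invented for demonstration, and it assumes crfm-helm 0.5.6 is installed so the module can be imported.

    import json
    import os
    import tempfile

    from helm.benchmark.scenarios.audio_language.ultra_suite_disorder_symptoms_scenario import (
        find_audio_json_pairs,
    )

    with tempfile.TemporaryDirectory() as root:
        clip_dir = os.path.join(root, "clips")
        os.makedirs(clip_dir)
        # An MP3 with a matching JSON annotation is paired; an unmatched MP3 is ignored.
        open(os.path.join(clip_dir, "child_001.mp3"), "wb").close()
        with open(os.path.join(clip_dir, "child_001.json"), "w") as f:
            json.dump({"disorder_symptom": "omission", "transcription": "red lorry"}, f)
        open(os.path.join(clip_dir, "child_002.mp3"), "wb").close()

        print(find_audio_json_pairs(root))
        # [('<root>/clips/child_001.mp3', '<root>/clips/child_001.json')]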
@@ -46,6 +46,10 @@ class VocalSoundScenario(Scenario):
      description = "Classify an audio sample of a spoken digit ([Gong et al, 2022](https://arxiv.org/abs/2205.03433))."
      tags: List[str] = ["audio", "classification"]

+     def __init__(self, sound: str) -> None:
+         super().__init__()
+         self._sound: str = sound
+
      def get_instances(self, output_path: str) -> List[Instance]:
          instances: List[Instance] = []
          down_loading_path = os.path.join(output_path, "download")
@@ -53,7 +57,12 @@ class VocalSoundScenario(Scenario):
          wav_save_dir = os.path.join(down_loading_path, "audio_16k")
          for file_name in tqdm(os.listdir(wav_save_dir)):
              local_audio_path: str = os.path.join(wav_save_dir, file_name)
-             if not file_name.endswith(".wav") or is_invalid_audio_file(local_audio_path):
+             if (
+                 not file_name.endswith(".wav")
+                 or is_invalid_audio_file(local_audio_path)
+                 # Skip this problematic file
+                 or file_name == "m0083_0_sneeze.wav"
+             ):
                  continue

              input = Input(
@@ -61,9 +70,14 @@ class VocalSoundScenario(Scenario):
              )

              answer: str = file_name.split("_")[-1].split(".")[0]
+             if answer.lower() != self._sound:
+                 continue
+
              if answer == "throatclearing":
                  answer = "throat clearing"

              references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
              instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+         assert len(instances) > 0, f"No instances found for sound: {self._sound}"
          return instances
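
With the VocalSoundScenario change above, the scenario now takes the target sound as a constructor argument and keeps only instances whose filename-derived label matches it. A hedged usage sketch follows; the output path is hypothetical and the first call downloads the audio data.

    from helm.benchmark.scenarios.audio_language.vocal_sound_scenario import VocalSoundScenario

    # One scenario instance per target sound, e.g. "cough" or "sneeze".
    scenario = VocalSoundScenario(sound="cough")
    instances = scenario.get_instances(output_path="./benchmark_output/vocal_sound")
    print(len(instances))  # the new assert raises if no matching instances were found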
@@ -40,8 +40,7 @@ class VoxCeleb2Scenario(Scenario):
          "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
      )
      IDENTITY_INSTRUCTION = (
-         "Listen to the audio and take your best guess to determine if the two speakers are the same person. "
-         "Give just the letter of your answer and nothing else."
+         "Listen to the audio and take your best guess to determine if the two speakers are the same person."
      )

      name = "voxceleb2"
@@ -12,7 +12,7 @@ from helm.benchmark.scenarios.scenario import (
      Output,
  )
  from helm.common.general import ensure_directory_exists
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn


  class AutoBencherCapabilitiesScenario(Scenario):
@@ -61,7 +61,7 @@ class AutoBencherCapabilitiesScenario(Scenario):
              # References are category ID, followed by level 2, 3 and 4 category names.
              references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
              if row["gold_answer"] is None:
-                 hlog(f"WARNING: Row had no gold_answer: {row}")
+                 hwarn(f"Row had no gold_answer: {row}")
                  continue
              instance = Instance(input=input, references=references, split=TEST_SPLIT)
              instances.append(instance)
@@ -1,8 +1,7 @@
- import os
  import pandas as pd
  from typing import List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -61,28 +60,30 @@ Clinical Note:

  class CHWCarePlanScenario(Scenario):
      """
-     A scenario for MIMIC-IV discharge summaries where the task is to predict the ICD-10 code(s).
+     A scenario for a dataset containing free form text of a clinical health worker care plan, with the
+     associated goal being to restructure that text into a given format.

      - Input: The clinical note (column "MO Note").
      - Output: The clinical note (column "MO Note"). We will use this note as the reference for entailment.
      """

      name = "chw_care_plan"
-     description = "A dataset containing free form text of a clinical health worker care plan, with the \
-     associated goal being to restructure that text into a given format."
+     description = (
+         "NoteExtract is a benchmark that focuses on the structured extraction of information"
+         "from free-form clinical text. It provides care plan notes authored by health workers"
+         "and evaluates a model's ability to convert them into a predefined structured format,"
+         "such as fields for Chief Complaint and History of Present Illness. The benchmark"
+         "emphasizes faithful extraction without hallucination or inference."
+     )
      tags = ["question_answering", "biomedical"]

-     def __init__(self):
-         """
-         :param data_file: Path to the mimiciv_icd10.feather file.
-         """
+     def __init__(self, data_path: str):
          super().__init__()
-         self.data_file = "/share/pi/nigam/datasets/CHW_Dataset.csv"
+         self.data_path = data_path

      def get_instances(self, output_path: str) -> List[Instance]:
-         ensure_directory_exists(os.path.dirname(self.data_file))
-
-         df = pd.read_csv(self.data_file) # columns: ["text", "target", ...]
+         check_file_exists(self.data_path, msg=f"[CHWCarePlanScenario] Required data file not found: '{self.data_path}'")
+         df = pd.read_csv(self.data_path) # columns: ["text", "target", ...]

          instances: List[Instance] = []

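
Several MedHELM scenarios in this release (CHWCarePlanScenario above, and CLEARScenario, DischargeMeScenario, and EHRSHOTScenario below) replace hard-coded cluster paths with an explicit data_path constructor argument. A minimal sketch of the new calling convention, with a hypothetical local CSV path; check_file_exists raises if the file is missing.

    from helm.benchmark.scenarios.chw_care_plan_scenario import CHWCarePlanScenario

    # The CSV location is now supplied by the caller instead of being hard-coded.
    scenario = CHWCarePlanScenario(data_path="/data/medhelm/CHW_Dataset.csv")  # hypothetical path
    instances = scenario.get_instances(output_path="./benchmark_output/chw_care_plan")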
@@ -2,7 +2,7 @@ import os
  import pandas as pd
  from typing import List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -76,16 +76,21 @@ class CLEARScenario(Scenario):
          "unemployment": "unemployment",
      }

-     def __init__(self, condition: str):
+     def __init__(self, condition: str, data_path: str):
          """Initialize the scenario with a specific medical condition"""
          super().__init__()

          if condition not in self.CONDITIONS:
              raise ValueError(f"Condition '{condition}' not supported. Available conditions: {self.CONDITIONS}")
-
+         self.data_path = data_path
          self.condition = condition
          self.name = f"clear_{condition}"
-         self.description = f"A dataset for evaluating {self.CONDITION_PROMPTS[condition]} detection from patient notes with yes/no/maybe classifications." # noqa: E501
+         self.description = (
+             "CLEAR is a benchmark designed to evaluate models on their ability to detect medical"
+             "conditions from patient notes using categorical responses. Each instance consists of"
+             "a clinical note and a target condition, requiring the model to classify the patient's"
+             "history as either affirmative, negative, or uncertain."
+         ) # noqa: E501
          self.tags = ["classification", "biomedical", condition.replace("_", "-")]

      def get_answer_choices(self) -> List[str]:
@@ -95,9 +100,8 @@ class CLEARScenario(Scenario):

      def get_instances(self, output_path: str) -> List[Instance]:
          """Load and process the data for the specified conditon."""
-         data_dir = "/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/"
-         excel_path = os.path.join(data_dir, f"{self.condition}.xlsx")
-         ensure_directory_exists(os.path.dirname(excel_path))
+         excel_path = os.path.join(self.data_path, f"{self.condition}.xlsx")
+         check_file_exists(excel_path, msg=f"[CLEARScenario] Required data file not found: '{excel_path}'")

          df = pd.read_excel(excel_path)

@@ -1,5 +1,5 @@
  from typing import List
- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -21,26 +21,34 @@ def file_preprocessing(data_path: str, task_objective: str) -> pd.DataFrame:
      data_path is directory that contains the downloaded files: '{base_dir}/physionet.org/'
      """
      # Load the first CSV file
-     df_diagnosis = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/diagnosis.csv.gz", compression="gzip", keep_default_na=False
+     diagnosis_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/diagnosis.csv.gz"
+     check_file_exists(
+         diagnosis_path, msg=f"[DischargeMeScenario] Required diagnosis file not found: '{diagnosis_path}'"
      )
-     df_discharge = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz", compression="gzip", keep_default_na=False
+     discharge_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz"
+     check_file_exists(
+         discharge_path, msg=f"[DischargeMeScenario] Required discharge file not found: '{discharge_path}'"
      )
+     target_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge_target.csv.gz"
+     check_file_exists(target_path, msg=f"[DischargeMeScenario] Required target file not found: '{target_path}'")
+     radiology_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz"
+     check_file_exists(
+         radiology_path, msg=f"[DischargeMeScenario] Required radiology file not found: '{radiology_path}'"
+     )
+     ed_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz"
+     check_file_exists(ed_path, msg=f"[DischargeMeScenario] Required ed file not found: '{ed_path}'")
+     triage_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz"
+     check_file_exists(triage_path, msg=f"[DischargeMeScenario] Required triage file not found: '{triage_path}'")
+     df_diagnosis = pd.read_csv(diagnosis_path, compression="gzip", keep_default_na=False)
+     df_discharge = pd.read_csv(discharge_path, compression="gzip", keep_default_na=False)
      df_target = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge_target.csv.gz",
+         target_path,
          compression="gzip",
          keep_default_na=False,
      )
-     df_radiology = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz", compression="gzip", keep_default_na=False
-     )
-     df_ed = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz", compression="gzip", keep_default_na=False
-     )
-     df_triage = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz", compression="gzip", keep_default_na=False
-     )
+     df_radiology = pd.read_csv(radiology_path, compression="gzip", keep_default_na=False)
+     df_ed = pd.read_csv(ed_path, compression="gzip", keep_default_na=False)
+     df_triage = pd.read_csv(triage_path, compression="gzip", keep_default_na=False)
      df_diagnosis_triage = pd.merge(
          df_diagnosis, df_triage, on="subject_id", how="inner", suffixes=("_df_diagnosis", "_df_triage")
      )
@@ -113,16 +121,23 @@ class DischargeMeScenario(Scenario):
      """

      name = "dischargeme"
-     description = "DischargeMe is a discharge instruction generation dataset and brief hospital course generation \
-     dataset collected from MIMIC-IV data, consindering only the discharge text as well as the radiology report text."
+     description = (
+         "DischargeMe is a benchmark designed to evaluate clinical text generation. It pairs"
+         "discharge summaries and radiology reports from MIMIC-IV with generation tasks"
+         "such as writing discharge instructions or summarizing the brief hospital course. The"
+         "benchmark assesses a model's ability to generate patient-facing documentation that is"
+         "complete, empathetic, and clinically accurate."
+     )
      tags = ["biomedical"]

+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
      def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/physionet.org"
-         ensure_directory_exists(data_path)
          instances: List[Instance] = []
-         df_bhc = file_preprocessing(data_path, "brief_hospital_course")
-         df_di = file_preprocessing(data_path, "discharge_instructions")
+         df_bhc = file_preprocessing(self.data_path, "brief_hospital_course")
+         df_di = file_preprocessing(self.data_path, "discharge_instructions")

          for i in range(df_bhc.shape[0]):
              prompt_bhc = create_prompt(
@@ -36,7 +36,13 @@ class EhrSqlScenario(Scenario):
      )

      name = "ehr_sql"
-     description = "Given a natural language instruction, generate an SQL query that would be used in clinical research."
+     description = (
+         "EHRSQL is a benchmark designed to evaluate models on generating structured queries"
+         "for clinical research. Each example includes a natural language question and a database"
+         "schema, and the task is to produce an SQL query that would return the correct result"
+         "for a biomedical research objective. This benchmark assesses a model's understanding"
+         "of medical terminology, data structures, and query construction."
+     )
      tags = ["sql", "medical", "reasoning"]

      def setup_database(self, output_path: str) -> str:
@@ -3,12 +3,11 @@ import os
  import pandas as pd
  import tiktoken

- from filelock import FileLock
  from functools import partial
  from tqdm import tqdm
  from typing import Any, Dict, List, Optional, Mapping

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists, ensure_directory_exists
  from helm.benchmark.scenarios.scenario import (
      TEST_SPLIT,
      Input,
@@ -1411,7 +1410,10 @@ class EHRSHOTScenario(Scenario):

      name = "ehrshot"
      description = (
-         "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not."
+         "EHRSHOT is a benchmark designed to evaluate a model's ability to predict future"
+         "clinical events using structured EHR data. Each instance contains a patient's"
+         "historical EHR data and a forward-looking clinical question about whether a particular"
+         "diagnosis, lab result, or hospital event will occur."
      )
      tags = [] # TODO

@@ -1420,24 +1422,32 @@ class EHRSHOTScenario(Scenario):
          "no",
      ]

-     def __init__(self, subject: str, max_length: Optional[int] = None):
+     def __init__(self, subject: str, data_path: str, max_length: Optional[int] = None):
          super().__init__()
          self.subject: str = subject # same as "task" or "labeling_function"
-         self.path_to_meds_dir: str = "/share/pi/nigam/data/medhelm/ehrshot/meds/"
-         self.path_to_tmp_dir: str = "/share/pi/nigam/data/medhelm/ehrshot/prompts/"
          self.max_length = max_length
+         self.data_path = data_path

-     def create_benchmark(self, n_procs: int = 4) -> Dict[str, str]:
+     def create_benchmark(self, output_path: str, n_procs: int = 4) -> Dict[str, str]:
          """Loads the MEDS dataset and converts it to prompts"""
-
          # Load MEDS EHRSHOT patient timelines
-         df_data = pd.read_parquet(os.path.join(self.path_to_meds_dir, "data/data.parquet"))
-         df_splits = pd.read_parquet(os.path.join(self.path_to_meds_dir, "metadata/subject_splits.parquet"))
-
+         data_parquet_path = os.path.join(self.data_path, "data/data.parquet")
+         check_file_exists(
+             data_parquet_path, msg=f"[EHRSHOTScenario] Required parquet data file not found: '{data_parquet_path}'"
+         )
+         splits_parquet_path = os.path.join(self.data_path, "metadata/subject_splits.parquet")
+         check_file_exists(
+             splits_parquet_path, msg=f"[EHRSHOTScenario] Required splits file not found: '{splits_parquet_path}'"
+         )
+         df_data = pd.read_parquet(data_parquet_path)
+         df_splits = pd.read_parquet(splits_parquet_path)
          # Load MEDS EHRSHOT labels
-         tasks = sorted(os.listdir(os.path.join(self.path_to_meds_dir, "labels")))
+         tasks = sorted(os.listdir(os.path.join(self.data_path, "labels")))
          for t in tasks:
-             path_to_labels: str = os.path.join(self.path_to_meds_dir, "labels", t, "labels.parquet")
+             path_to_labels: str = os.path.join(self.data_path, "labels", t, "labels.parquet")
+             check_file_exists(
+                 path_to_labels, msg=f"[EHRSHOTScenario] Required labels file not found: '{path_to_labels}'"
+             )
              if t != self.subject or not os.path.exists(path_to_labels):
                  continue
              df_labels = pd.read_parquet(path_to_labels)
@@ -1470,18 +1480,16 @@ class EHRSHOTScenario(Scenario):
              df_labels["prompt"] = prompts

              # Save to parquet
-             path_to_output_dir: str = os.path.join(self.path_to_tmp_dir, self.subject)
+             path_to_output_dir: str = os.path.join(output_path, self.subject)
              ensure_directory_exists(path_to_output_dir)
              df_labels.to_parquet(os.path.join(path_to_output_dir, "medhelm_prompts.parquet"))
              return {"status": "success"}

      def get_instances(self, output_path: str) -> List[Instance]:
-         path_to_input_csv: str = os.path.join(self.path_to_tmp_dir, self.subject, "medhelm_prompts.parquet")
-         lock_path = path_to_input_csv + ".lock"
-         with FileLock(lock_path):
-             if not os.path.exists(path_to_input_csv):
-                 print(f"Creating benchmark from SCRATCH for {self.subject}...")
-                 self.create_benchmark() # Create benchmark from scratch
+         path_to_input_csv: str = os.path.join(output_path, self.subject, "medhelm_prompts.parquet")
+         if not os.path.exists(path_to_input_csv):
+             print(f"Creating benchmark from SCRATCH for {self.subject}...")
+             self.create_benchmark(output_path=output_path) # Create benchmark from scratch

          # Load data for this task
          df = pd.read_parquet(path_to_input_csv)
@@ -1509,38 +1517,3 @@ class EHRSHOTScenario(Scenario):
              )

          return instances
-
-
- if __name__ == "__main__":
-     # Generate statistics on prompts
-     from transformers import AutoTokenizer
-
-     tokenizer = AutoTokenizer.from_pretrained("gpt2")
-     tqdm.pandas()
-     n_procs: int = 10
-
-     os.makedirs("./ehrshot_stats", exist_ok=True)
-     for t in TASK_FULL_NAMES.keys():
-         # Skip if already exists
-         if os.path.exists(f"./ehrshot_stats/{t}.txt"):
-             print(f"Skipping {t} because it already exists")
-             continue
-
-         # Create benchmark
-         scenario = EHRSHOTScenario(subject=t)
-         scenario.create_benchmark(n_procs=n_procs)
-         instances = scenario.get_instances("test.csv")
-
-         # Calculate prompt token stats
-         path_to_input_csv = os.path.join(scenario.path_to_tmp_dir, scenario.subject, "medhelm_prompts.parquet")
-         df = pd.read_parquet(path_to_input_csv)
-         df["prompt_n_tokens"] = df["prompt"].progress_apply(lambda x: len(tokenizer.encode(x)))
-         with open(f"./ehrshot_stats/{t}.txt", "w") as f:
-             f.write("-" * 100 + "\n")
-             f.write(f"Task: {t}\n")
-             f.write(f"# of instances: {len(instances)}\n")
-             f.write(f"# of positives: {df['boolean_value'].sum()}\n")
-             f.write(f"Size of splits:\n{df['split'].value_counts()}\n")
-             f.write(f"# tokens per prompt:\n{df['prompt_n_tokens'].describe()}\n")
-             f.write("-" * 100 + "\n")
-         df.to_parquet(os.path.join(scenario.path_to_tmp_dir, scenario.subject, "medhelm_prompts.parquet"))
@@ -2,7 +2,7 @@ from collections import defaultdict
  from dataclasses import dataclass, field, replace
  from functools import cached_property
  from typing import List, Optional
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn

  import dacite
  import re
@@ -111,7 +111,7 @@ def validate_grammar(grammar: Grammar):
          # Make sure all categories are defined
          for category in expansion.categories:
              if category not in grammar.category_to_rules:
-                 hlog(f"WARNING: Category {category} is not defined")
+                 hwarn(f"Category {category} is not defined")


  def read_grammar(path: str) -> Grammar:
@@ -57,7 +57,12 @@ class HeadQAScenario(Scenario):
      SKIP_TEXTQA: bool = False

      name = "head_qa"
-     description = "A collection of biomedical multiple-choice questions for testing medical knowledge."
+     description = (
+         "HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to"
+         "evaluate a model's medical knowledge and reasoning. Each instance presents a clinical"
+         "or scientific question with four answer options, requiring the model to select the most"
+         "appropriate answer."
+     )
      tags = ["question_answering", "biomedical", "medicine"]

      def __init__(self, language: str = "en", category: Optional[str] = None):
@@ -0,0 +1,85 @@
+ import os
+ import re
+ from typing import List
+
+ from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Input,
+     Reference,
+     Output,
+     CORRECT_TAG,
+     TEST_SPLIT,
+ )
+ from helm.common.general import ensure_directory_exists
+
+
+ class InfiniteBenchEnQAScenario(Scenario):
+     """InfiniteBench En.QA
+
+     InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+     understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
+     InfiniteBench that requires models to perform open-form question answering on questions that necessitate
+     long-range dependency and reasoning, beyond simple short passage retrieval.
+     """
+
+     name = "infinite_bench_en_qa"
+     description = "∞Bench En.QA is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
+     tags = ["question_answering"]
+
+     def __init__(self, max_num_words: int):
+         self.max_num_words = max_num_words
+         super().__init__()
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         # Get InfiniteBench from HuggingFace
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+
+         # Define the features schema
+         ft = Features(
+             {
+                 "id": Value("int64"),
+                 "context": Value("string"),
+                 "input": Value("string"),
+                 "answer": Sequence(Value("string")),
+                 "options": Sequence(Value("string")),
+             }
+         )
+
+         # Load the dataset with the specified features
+         dataset = load_dataset(
+             "xinrongzhang2022/InfiniteBench",
+             split="longbook_qa_eng",
+             features=ft,
+             revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+         )
+
+         assert isinstance(dataset, Dataset)
+
+         def count_words(text: str) -> int:
+             return len(re.split(r"\s+", text.strip()))
+
+         dataset = dataset.filter(
+             lambda example: count_words(example["context"])
+             + count_words(example["input"])
+             + sum(count_words(option) for option in example["options"])
+             <= self.max_num_words
+         )
+
+         # Read all instances
+         instances: List[Instance] = []
+         for row in dataset:
+             id = row["id"]
+             input = Input(text=row["context"] + "\n\n" + row["input"])
+             instance = Instance(
+                 id=id,
+                 input=input,
+                 references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                 split=TEST_SPLIT,
+             )
+             instances.append(instance)
+
+         return instances
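
The new InfiniteBenchEnQAScenario above drops any example whose combined context, question, and options exceed max_num_words. A small self-contained sketch of that filter follows; the rows are invented, and count_words mirrors the helper defined inside get_instances.

    import re

    def count_words(text: str) -> int:
        return len(re.split(r"\s+", text.strip()))

    max_num_words = 8
    rows = [
        {"context": "A short novel excerpt.", "input": "Who wrote it?", "options": []},
        {"context": "word " * 20, "input": "Too long to keep?", "options": []},
    ]
    kept = [
        row
        for row in rows
        if count_words(row["context"])
        + count_words(row["input"])
        + sum(count_words(o) for o in row["options"])
        <= max_num_words
    ]
    print(len(kept))  # 1: only the short row survives the filter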