crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registries, and is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -0,0 +1,152 @@
+ from typing import Any, Dict, List, Optional
+
+ from datasets import load_dataset, Dataset
+ from helm.common.hierarchical_logger import htrack_block
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TRAIN_SPLIT,
+     TEST_SPLIT,
+     VALID_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ class MELTTranslationScenario(Scenario):
+     name = "melt_translation"
+     description = "Machine Translation scenario."
+     tags = ["machine_translation"]
+
+     def __init__(
+         self,
+         dataset_name: str,
+         revision: str,
+         source_language: str,
+         target_language: str,
+         subset: Optional[str] = None,
+         splits: Optional[Dict[str, str]] = None,
+     ):
+         """Initializes the question answering scenario.
+
+         Args:
+             dataset_name: The name of the dataset.
+             revision: The revision of the dataset to use.
+             source_language: The source language to use.
+             target_language: The target language to use.
+             subset: The subset of the dataset to use. Defaults to "".
+             splits: The splits to use for the dataset. Defaults to None.
+         """
+         super().__init__()
+         self.MAX_TRAIN_INSTANCES = 20_000
+         valid_languages = set(["vi", "en"])
+         self.dataset_name = dataset_name
+         self.subset = subset
+         self.revision = revision
+         self.splits = splits
+         self.source_language = source_language
+         self.target_language = target_language
+         if self.source_language not in valid_languages or self.target_language not in valid_languages:
+             raise ValueError("Supported languages: vi, en.")
+         if self.source_language == self.target_language:
+             raise ValueError("The source language and the target language should be different.")
+         if self.source_language != "en" and self.target_language != "en":
+             raise ValueError("One of the languages should be English.")
+
+     def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+         """
+         Helper for generating instances for a split.
+         Args:
+             splits (dict): Which splits to partition the data into.
+         Returns:
+             List[Instance]: Instances from the file for the specified split.
+         """
+         with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
+             hf_dataset: Any = load_dataset(
+                 self.dataset_name,
+                 self.subset,
+                 revision=self.revision,
+                 trust_remote_code=True,
+             )
+
+         instances: List[Instance] = []
+
+         for dataset_split_name, helm_split_name in splits.items():
+             if helm_split_name == TRAIN_SPLIT:
+                 hf_dataset[dataset_split_name] = hf_dataset[dataset_split_name].shuffle(seed=42)[
+                     : self.MAX_TRAIN_INSTANCES
+                 ]
+                 hf_dataset[dataset_split_name] = Dataset.from_dict(hf_dataset[dataset_split_name])
+
+             for example in hf_dataset[dataset_split_name]:
+                 source_sentence = example[self.source_language]
+                 target_sentence = example[self.target_language]
+                 instances.append(
+                     Instance(
+                         input=Input(text=source_sentence),
+                         references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                         split=helm_split_name,
+                     )
+                 )
+         return instances
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         if self.splits is None:
+             splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+         else:
+             splits = {}
+             if "train" in self.splits:
+                 splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+             if "validation" in self.splits:
+                 splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+             if "test" in self.splits:
+                 splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+         instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+         return instances
+
+
+ class MELTTranslationOPUS100Scenario(MELTTranslationScenario):
+     """
+     Scenario for the OPUS100 dataset.
+     """
+
+     name = "melt_translation_opus100"
+     description = "OPUS100 dataset for machine translation."
+     tags = ["machine_translation"]
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             dataset_name="vietgpt/opus100_envi",
+             revision="45df06fb0b31edc882d7c8d34389261f995e5208",
+             splits={
+                 TRAIN_SPLIT: "train",
+                 VALID_SPLIT: "validation",
+                 TEST_SPLIT: "test",
+             },
+             **kwargs,
+         )
+
+
+ class MELTTranslationPhoMTScenario(MELTTranslationScenario):
+     """
+     Scenario for the PhoMT dataset.
+     """
+
+     name = "melt_translation_phomt"
+     description = "PhoMT dataset for machine translation."
+     tags = ["machine_translation"]
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             dataset_name="ura-hcmut/PhoMT",
+             revision="74386685db01dc038860ff0a90d9f5fbde284bf7",
+             splits={
+                 TRAIN_SPLIT: "train",
+                 VALID_SPLIT: "validation",
+                 TEST_SPLIT: "test",
+             },
+             **kwargs,
+         )
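For orientation, a minimal usage sketch of the new MELT translation scenarios, based only on the constructor and get_instances signature visible in the hunk above; the argument values are illustrative assumptions, not part of the release:

    # Hypothetical sketch: build the OPUS100 English-to-Vietnamese scenario and
    # materialize HELM instances (requires the Hugging Face "datasets" package).
    scenario = MELTTranslationOPUS100Scenario(source_language="en", target_language="vi")
    instances = scenario.get_instances(output_path="./melt_translation")
    # Each Instance pairs an English source sentence with its Vietnamese reference,
    # tagged CORRECT_TAG and assigned to the train/validation/test split mapped above.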
@@ -9,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
      PassageQuestionInput,
      Output,
  )
+ from helm.common.general import check_file_exists


  class MentalHealthScenario(Scenario):
@@ -48,10 +49,19 @@ class MentalHealthScenario(Scenario):
      """

      name = "mental_health"
-     description = "A dataset containing a counselor and mental health patient conversation, where the objective is to \
-         generate an empathetic counselor response."
+     description = (
+         "MentalHealth is a benchmark focused on evaluating empathetic communication in"
+         "mental health counseling. It includes simulated conversations between patients"
+         "and counselors, where the task is to generate compassionate and appropriate counselor"
+         "responses. The benchmark assesses a model's ability to support patients emotionally"
+         "and meaningfully engage in therapeutic conversations."
+     )
      tags = ["dialogue", "counseling", "mental_health", "empathy", "healthcare"]

+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
      def process_dialogue_data(self, data: pd.DataFrame) -> List[Instance]:
          """
          Process the dialogue data into evaluation instances.
@@ -102,9 +112,10 @@ class MentalHealthScenario(Scenario):
          Returns:
              List[Instance]: List of processed instances for evaluation
          """
-         # Load the processed dialogue data
-         data_path = "/share/pi/nigam/data/medhelm/mental_health/processed_dialogues.csv"
-         dialogue_data = pd.read_csv(data_path)
+         check_file_exists(
+             self.data_path, msg=f"[MentalHealthScenario] Required data file not found: '{self.data_path}'"
+         )
+         dialogue_data = pd.read_csv(self.data_path)

          # Process into instances
          instances = self.process_dialogue_data(dialogue_data)
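This release replaces several hard-coded cluster paths with a required data_path constructor argument; the same pattern recurs in the MIMIC-BHC, MIMIC-RRS, MIMIC-IV billing code, and N2C2 hunks below. A hedged sketch of what this means for a caller, with an illustrative path:

    # Hypothetical sketch: callers now pass the data location explicitly instead of
    # relying on the previously hard-coded /share/pi/nigam/... path.
    scenario = MentalHealthScenario(data_path="/data/mental_health/processed_dialogues.csv")
    instances = scenario.get_instances(output_path="./mental_health")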
@@ -1,7 +1,7 @@
  import json
  from typing import Dict, List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -59,15 +59,20 @@ class MIMICBHCScenario(Scenario):

      name = "mimic_bhc"
      description = (
-         "A summarization task using a curated collection of preprocessed discharge notes"
-         " paired with their corresponding brief hospital course (BHC) summaries."
+         "MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief"
+         "Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV,"
+         "each paired with its corresponding BHC summary. The benchmark evaluates a model's"
+         "ability to condense detailed clinical information into accurate, concise summaries that"
+         "reflect the patient's hospital stay."
      )
      tags = ["summarization", "biomedical"]

+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
      def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/bhc-mimiciv/"
-         ensure_directory_exists(data_path)
-         data_path = data_path + "mimic_iv_bhc.json"
+         check_file_exists(self.data_path, msg=f"[MIMICBHCScenario] Required data file not found: '{self.data_path}'")

          instances: List[Instance] = []
          # Limit to zero shot setting for now
@@ -77,7 +82,7 @@ class MIMICBHCScenario(Scenario):
              "test": TEST_SPLIT,
          }

-         with open(data_path, "r") as f:
+         with open(self.data_path, "r") as f:
              data = [json.loads(line) for line in f]

          for data_split, split in splits.items():
@@ -1,7 +1,7 @@
  import os
  from typing import Dict, List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -44,15 +44,18 @@ class MIMICRRSScenario(Scenario):

      name = "mimic_rrs"
      description = (
-         "A dataset containing radiology reports with findings sections from MIMIC-III paired with"
-         " their corresponding impression sections, used for generating radiology report summaries."
+         "MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III"
+         "database. It contains pairs of 'Findings' and 'Impression' sections, enabling evaluation"
+         "of a model's ability to summarize diagnostic imaging observations into concise, clinically"
+         "relevant conclusions."
      )
      tags = ["question_answering", "biomedical"]

-     def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/rrs-mimiciii/all"
-         ensure_directory_exists(data_path)
+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path

+     def get_instances(self, output_path: str) -> List[Instance]:
          instances: List[Instance] = []
          # Limit to zero shot setting for now
          splits: Dict[str, str] = {
@@ -64,8 +67,14 @@ class MIMICRRSScenario(Scenario):
          for data_split, split in splits.items():
              split_findings_name: str = f"{data_split}.findings.tok"
              split_impressions_name: str = f"{data_split}.impression.tok"
-             findings_path: str = os.path.join(data_path, split_findings_name)
-             impressions_path: str = os.path.join(data_path, split_impressions_name)
+             findings_path: str = os.path.join(self.data_path, split_findings_name)
+             impressions_path: str = os.path.join(self.data_path, split_impressions_name)
+             check_file_exists(
+                 findings_path, msg=f"[MIMICRRSScenario] Required findings file not found: '{findings_path}'"
+             )
+             check_file_exists(
+                 impressions_path, msg=f"[MIMICRRSScenario] Required impressions file not found: '{impressions_path}'"
+             )
              findings: List[str] = self.read_file(findings_path)
              impressions: List[str] = self.read_file(impressions_path)
              assert len(findings) == len(impressions), "Findings and impressions must have the same length"
@@ -1,9 +1,8 @@
- import os
  import pandas as pd
  import numpy as np
  from typing import List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -24,21 +23,28 @@ class MIMICIVBillingCodeScenario(Scenario):
      """

      name = "mimiciv_billing_code"
-     description = "A dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes."
+     description = (
+         "MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the"
+         "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task"
+         "requires models to extract structured billing codes based on free-text clinical notes,"
+         "reflecting real-world hospital coding tasks for financial reimbursement."
+     )
      tags = ["question_answering", "biomedical"]

-     def __init__(self, data_file: str):
+     def __init__(self, data_path: str):
          """
-         :param data_file: Path to the mimiciv_icd10.feather file.
+         :param data_path: Path to the mimiciv_icd10.feather file.
          """
          super().__init__()
-         self.data_file = data_file
+         self.data_path = data_path

      def get_instances(self, output_path: str) -> List[Instance]:
-         ensure_directory_exists(os.path.dirname(self.data_file))
+         check_file_exists(
+             self.data_path, msg=f"[MIMICIVBilligCodeScenario] Required data file not found: '{self.data_path}'"
+         )

          # Read the preprocessed MIMIC-IV data (.feather format)
-         df = pd.read_feather(self.data_file)  # columns: ["text", "target", ...]
+         df = pd.read_feather(self.data_path)  # columns: ["text", "target", ...]

          instances: List[Instance] = []

@@ -40,7 +40,7 @@ class MMLUProScenario(Scenario):

      def __init__(self, subject: str):
          super().__init__()
-         self.subject: str = subject
+         self.subject: str = subject.replace("_", " ")

      def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
          """
@@ -31,9 +31,12 @@ class MTSamplesProceduresScenario(Scenario):
      GITHUB_DIR_URL = f"https://github.com/raulista1997/benchmarkdata/tree/{GIT_HASH}/mtsample_procedure"
      RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsample_procedure/"

-     name = "mtsamples"
+     name = "mtsamples_procedures"
      description = (
-         "A dataset that provides a patient note regarding an operation, with the objective to document the procedure."
+         "MTSamples Procedures is a benchmark composed of transcribed operative notes,"
+         "focused on documenting surgical procedures. Each example presents a brief patient case"
+         "involving a surgical intervention, and the model is tasked with generating a coherent"
+         "and clinically accurate procedural summary or treatment plan."
      )
      tags = ["medical", "transcription", "plan_generation"]

@@ -36,8 +36,9 @@ class MTSamplesReplicateScenario(Scenario):

      name = "mtsamples_replicate"
      description = (
-         "A dataset of clinical notes where the model is prompted to generate "
-         "a reasonable treatment plan for the patient based on transcribed medical reports."
+         "MTSamples Replicate is a benchmark that provides transcribed medical reports"
+         "from various specialties. It is used to evaluate a model's ability to generate clinically"
+         "appropriate treatment plans based on unstructured patient documentation"
      )
      tags = ["medical", "transcription", "plan_generation"]

@@ -188,8 +188,13 @@ class N2C2CTMatchingScenario(Scenario):
      """

      name = "n2c2_ct_matching"
-     description = "A dataset that provides clinical notes and asks the model to classify whether the \
-         patient is a valid candidate for a provided clinical trial."
+     description = (
+         "N2C2-CT is a benchmark designed to evaluate a model's ability to match patients to"
+         "appropriate clinical trials based on eligibility criteria. Each example includes a clinical"
+         "note and a trial description. The model is tasked with determining whether the patient"
+         "is a valid candidate for the trial. This benchmark supports automation and decision"
+         "support in clinical research enrollment."
+     )
      tags = []  # TODO

      POSSIBLE_ANSWER_CHOICES: List[str] = [
@@ -197,11 +202,12 @@ class N2C2CTMatchingScenario(Scenario):
          "no",
      ]

-     def __init__(self, subject: str):
+     def __init__(self, data_path: str, subject: str):
          super().__init__()
          self.subject: str = subject  # specific inclusion criterion to assess
-         self.path_to_train_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/train/"
-         self.path_to_test_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/test/"
+         self.data_path: str = data_path
+         self.path_to_train_dir: str = os.path.join(self.data_path, "train/")
+         self.path_to_test_dir: str = os.path.join(self.data_path, "test/")

      def create_prompt(self, patient: Dict[str, Any]) -> str:
          # Cast None values to empty strings during string formatting, but keep the original functions returning None
@@ -11,6 +11,7 @@ from typing import List, Optional, Tuple, Dict

  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.common.local_context import LocalContext
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
  from helm.common.authentication import Authentication
  from helm.common.optional_dependencies import handle_module_not_found_error
@@ -39,7 +40,7 @@ except ModuleNotFoundError as e:
  # https://github.com/stanford-crfm/benchmarking/issues/569
  def get_test_tokenizer_service() -> TokenizerService:
      # Pointed to the default local path set in run.py (--local-path)
-     return TokenizerService(ServerService(base_path="prod_env", root_mode=True), Authentication("test"))
+     return TokenizerService(LocalContext(base_path="prod_env"))


  SOLUTION_TAG: str = "solution"
@@ -0,0 +1,79 @@
+ import json
+ import os
+ import re
+ from typing import List, Optional
+
+ import datasets
+ import tiktoken
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     Output,
+     Reference,
+     Scenario,
+     Instance,
+     TEST_SPLIT,
+     Input,
+ )
+ from helm.common.general import ensure_directory_exists
+
+
+ class OpenAIMRCRScenario(Scenario):
+     """OpenAI MRCR scenario
+
+     OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking
+     an LLM's ability to distinguish between multiple needles hidden in context. This eval is
+     inspired by the MRCR eval first introduced by Gemini (https://arxiv.org/pdf/2409.12640v2).
+
+     The task is as follows: The model is given a long, multi-turn, synthetically generated
+     conversation between user and model where the user asks for a piece of writing about a topic,
+     e.g. "write a poem about tapirs" or "write a blog post about rocks". Hidden in this conversation
+     are 2, 4, or 8 identical asks, and the model is ultimately prompted to return the i-th instance
+     of one of those asks. For example, "Return the 2nd poem about tapirs".
+
+     Reference: https://huggingface.co/datasets/openai/mrcr"""
+
+     name = "openai_mrcr"
+     description = "OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2)."  # noqa: E501
+     tags = ["long_context", "mrcr"]
+
+     NEEDLES_OPTIONS = [2, 4, 8]
+
+     def __init__(self, needles: int, max_num_words: Optional[int] = None):
+         super().__init__()
+         self.needles = needles
+         self.max_num_words = max_num_words
+         if needles not in self.NEEDLES_OPTIONS:
+             raise Exception(f"Needles must be one of {self.NEEDLES_OPTIONS}")
+         self.tokenizer = tiktoken.get_encoding("o200k_base")
+
+     def count_words(self, messages: list[dict]) -> int:
+         return sum([len(re.split(r"\s+", m["content"].strip())) for m in messages])
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset = datasets.load_dataset(
+             "openai/mrcr",
+             cache_dir=cache_dir,
+             split="train",
+             data_files=[f"{self.needles}needle.parquet"],
+             revision="204b0d4e8d9ca5c0a90bf942fdb2a5969094adc0",
+         )
+         instances = []
+         for idx, row in enumerate(dataset):
+             messages = json.loads(row["prompt"])
+             if self.max_num_words and self.count_words(messages) > self.max_num_words:
+                 continue
+             input = Input(messages=messages)
+             references = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+             instance = Instance(
+                 id=f"{self.needles}needle{idx}",
+                 input=input,
+                 references=references,
+                 split=TEST_SPLIT,
+                 extra_data={"random_string_to_prepend": row["random_string_to_prepend"]},
+             )
+             instances.append(instance)
+
+         return instances
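A minimal sketch of how this new scenario might be driven, based only on the constructor and word filter shown above; the argument values are illustrative assumptions:

    # Hypothetical sketch: load the 2-needle split and skip conversations longer
    # than roughly 128k whitespace-delimited words (see count_words above).
    scenario = OpenAIMRCRScenario(needles=2, max_num_words=128_000)
    instances = scenario.get_instances(output_path="./openai_mrcr")
    # Each instance carries the multi-turn conversation as messages, the expected
    # answer as its reference, and random_string_to_prepend in extra_data.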
@@ -125,7 +125,12 @@ class PubMedQAScenario(Scenario):
      """

      name = "pubmed_qa"
-     description = "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions."
+     description = (
+         "PubMedQA is a biomedical question-answering dataset that evaluates a model's"
+         "ability to interpret scientific literature. It consists of PubMed abstracts paired with"
+         "yes/no/maybe questions derived from the content. The benchmark assesses a model's"
+         "capability to reason over biomedical texts and provide factually grounded answers."
+     )
      tags = ["question_answering", "biomedical"]

      POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]
@@ -1,6 +1,6 @@
  import csv
+ import os

- from filelock import FileLock
  from typing import Dict, List
  from docx import Document

@@ -13,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
      Reference,
      Output,
  )
+ from helm.common.general import ensure_file_downloaded


  def extract_red_text_runs(document):
@@ -87,10 +88,19 @@ class RaceBasedMedScenario(Scenario):
      """

      name = "race_based_med"
-     description = "A collection of LLM outputs in response to medical questions with race-based biases, \
-         with the objective being to classify whether the output contains racially biased content."
+     description = (
+         "RaceBias is a benchmark used to evaluate language models for racially biased or"
+         "inappropriate content in medical question-answering scenarios. Each instance consists"
+         "of a medical question and a model-generated response. The task is to classify whether"
+         "the response contains race-based, harmful, or inaccurate content. This benchmark"
+         "supports research into bias detection and fairness in clinical AI systems."
+     )
      tags = ["knowledge", "reasoning", "biomedical"]
      POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+     FILE_URL: str = (
+         "https://static-content.springer.com/esm/"
+         "art%3A10.1038%2Fs41746-023-00939-z/MediaObjects/41746_2023_939_MOESM1_ESM.docx"
+     )

      def create_benchmark(self, csv_path) -> Dict[str, str]:
          data = {}
@@ -114,12 +124,12 @@ class RaceBasedMedScenario(Scenario):
          return data

      def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/medhelm/race_based/race_based.csv"
          # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
-         word_file = "/share/pi/nigam/data/medhelm/race_based/race_based.docx"
-         lock_path = data_path + ".lock"
-         with FileLock(lock_path):
-             # if not os.path.exists(data_path):
+         data_path = os.path.join(output_path, "race_based.csv")
+
+         if not os.path.exists(data_path):
+             word_file = os.path.join(output_path, "race_based.docx")
+             ensure_file_downloaded(source_url=self.FILE_URL, target_path=word_file, unpack=False)
              create_csv_from_word(word_file, data_path)

          instances: List[Instance] = []
@@ -133,7 +133,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
          input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
          # Calculate the number of tokens in the example
          total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
-         print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+         # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
          if total_tokens + tokens_to_generate > max_seq_length:
              num_docs -= incremental
              break
@@ -142,7 +142,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
          if num_docs > len(docs):
              num_docs = len(docs)
              break
-     print('Number of documents:', num_docs)
+     # print('Number of documents:', num_docs)

      # Generate samples
      for index in tqdm(range(num_samples)):
@@ -72,7 +72,7 @@ Question: {query} Answer:""" # noqa: E501

  class RULERHotpotQAScenario(_RULERQAScenario):
      name = "ruler_hotpotqa"
-     description = "The HotpotQA long-context multi-hop RAG question answering scenario from RULER"
+     description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario."  # noqa: E501
      tags = ["long_context", "rag"]

      def __init__(self, max_num_words: int):
@@ -81,7 +81,7 @@ class RULERHotpotQAScenario(_RULERQAScenario):

  class RULERSQuADScenario(_RULERQAScenario):
      name = "ruler_squad"
-     description = "The SQuAD question answering scenario from RULER"
+     description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario."  # noqa: E501
      tags = ["long_context", "rag"]

      def __init__(self, max_num_words: int):
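Based on the constructors visible in these two hunks, a hedged sketch of how the RULER QA scenarios are parameterized; the word budget is an illustrative value, not taken from the diff:

    # Hypothetical sketch: both RULER QA scenarios take a max_num_words budget that
    # caps how much retrieved context is packed into each long-context instance.
    hotpot_scenario = RULERHotpotQAScenario(max_num_words=65_536)
    squad_scenario = RULERSQuADScenario(max_num_words=65_536)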