crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -133,7 +133,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
         # Calculate the number of tokens in the example
         total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
-        print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+        # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
         if total_tokens + tokens_to_generate > max_seq_length:
             num_docs -= incremental
             break
@@ -142,7 +142,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         if num_docs > len(docs):
            num_docs = len(docs)
            break
-    print('Number of documents:', num_docs)
+    # print('Number of documents:', num_docs)
 
     # Generate samples
     for index in tqdm(range(num_samples)):
@@ -72,7 +72,7 @@ Question: {query} Answer:""" # noqa: E501
 
 class RULERHotpotQAScenario(_RULERQAScenario):
     name = "ruler_hotpotqa"
-    description = "The HotpotQA long-context multi-hop RAG question answering scenario from RULER"
+    description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario." # noqa: E501
     tags = ["long_context", "rag"]
 
     def __init__(self, max_num_words: int):
@@ -81,7 +81,7 @@ class RULERHotpotQAScenario(_RULERQAScenario):
 
 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
-    description = "The SQuAD question answering scenario from RULER"
+    description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario." # noqa: E501
     tags = ["long_context", "rag"]
 
     def __init__(self, max_num_words: int):
@@ -1750,7 +1750,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
                 text_noun = self.prompt_components["text_noun"]
                 instruction = self.prompt_components["single_instruction"]
 
-                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
                     question=question.format(row["question_translated"]),
                     text_noun=text_noun,
                     text=row["text"],
@@ -1898,7 +1898,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
                 text_noun = self.prompt_components["text_noun"]
                 instruction = self.prompt_components["single_instruction"]
 
-                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
                     question=question.format(row["question_translated"]),
                     text_noun=text_noun,
                     text=row["text"],
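The two LINDSEA hunks above only double a backslash, which is easy to misread as a no-op. A minimal illustration (not HELM code, placeholder values): in a Python string literal, \{ is an invalid escape sequence, so the interpreter keeps the backslash but emits a warning (DeprecationWarning, SyntaxWarning on newer Pythons), while \\ spells the literal backslash explicitly; the formatted prompt is unchanged.

    # Illustrative sketch only: placeholder values, not HELM data.
    old = "{question}\{text_noun}: {text}\n{instruction}"   # warns: invalid escape sequence '\{'
    new = "{question}\\{text_noun}: {text}\n{instruction}"  # explicit backslash, no warning
    assert old == new  # both literals produce the same string
    print(new.format(question="Q?", text_noun="Passage", text="...", instruction="Answer yes or no."))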
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,19 @@ class SHCBMTMedScenario(Scenario):
 
     name = "shc_bmt_med"
     description = (
-        "A dataset containing patient notes with associated "
-        "questions and answers related to bone marrow transplantation."
+        "BMT-Status is a benchmark composed of clinical notes and associated binary questions"
+        "related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT),"
+        "or hematopoietic cell transplant (HCT) status. The goal is to determine whether the"
+        "patient received a subsequent transplant based on the provided clinical documentation."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -39,7 +46,7 @@ class SHCBMTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -47,10 +54,9 @@ class SHCBMTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-BMT-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCBMTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
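Across these SHC scenarios, the hard-coded /dbfs/... path is replaced by a required data_path constructor argument guarded by check_file_exists. A hedged sketch of what instantiation looks like after this change, assuming the ScenarioSpec pattern HELM run specs use elsewhere; the CSV path below is a placeholder, not a real dataset location:

    # Sketch only: the data_path value is a placeholder.
    from helm.benchmark.scenarios.scenario import ScenarioSpec

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario",
        args={"data_path": "/path/to/medhelm-BMT-dataset_filtered.csv"},  # placeholder path
    )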
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,18 @@ class SHCCDIMedScenario(Scenario):
 
     name = "shc_cdi_med"
     description = (
-        "A dataset built from Clinical Document Integrity (CDI) notes, to assess "
-        "the ability to answer verification questions from previous notes."
+        "CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI)"
+        "notes. It is used to evaluate a model's ability to verify clinical conditions based on"
+        "documented evidence in patient records."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +46,7 @@ class SHCCDIMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    f"Provide an answer to the following question: {question} with the following context: {context} , "
                     "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -48,10 +54,9 @@ class SHCCDIMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CDI-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCCDIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,19 @@ class SHCCONFMedScenario(Scenario):
 
     name = "shc_conf_med"
     description = (
-        "A dataset of clinical notes from adolescent patients used to identify sensitive "
-        "protected health information that should be restricted from parental access."
+        "MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is"
+        "used to evaluate whether the content contains sensitive protected health information"
+        "(PHI) that should be restricted from parental access, in accordance with adolescent"
+        "confidentiality policies in clinical care."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +47,7 @@ class SHCCONFMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -48,10 +55,9 @@ class SHCCONFMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CONF-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCCONFMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,18 @@ class SHCENTMedScenario(Scenario):
 
     name = "shc_ent_med"
     description = (
-        "A dataset designed to evaluate performance in "
-        "identifying appropriate patient referrals to Ear, Nose, and Throat specialists."
+        "ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note"
+        "supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess"
+        "models' abilities to make referral decisions based on unstructured clinical text."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -41,7 +47,7 @@ class SHCENTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"{counter} Provide an answer to the following {question} with the following context:"
+                    f"{counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
                     " Do not provide any additional details or response, just a simple A, B, or C response."
                 )
@@ -50,10 +56,9 @@ class SHCENTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-ENT-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCENTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,11 +23,19 @@ class SHCGIPMedScenario(Scenario):
     """
 
     name = "shc_gip_med"
-    description = "A dataset evaluating performance in identifying appropriate patient referrals to hospice care."
+    description = (
+        "HospiceReferral is a benchmark that evaluates model performance in identifying"
+        "whether patients are eligible for hospice care based on palliative care clinical notes."
+        "The benchmark focuses on end-of-life care referral decisions."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -36,7 +45,7 @@ class SHCGIPMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -44,10 +53,9 @@ class SHCGIPMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-GIP-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCGIPMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -0,0 +1,78 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPRIVACYMedScenario(Scenario):
+    """
+    This dataset features messages sent generated by an LLM from patient clinical notes data.
+    The scenario evaluates the ability of an LLM to determine if any potentially confidential
+    information about the patient was included. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_privacy_med"
+    description = (
+        "PrivacyDetection is a benchmark composed of patient portal messages submitted by"
+        "patients or caregivers. The task is to determine whether the message contains any"
+        "confidential or privacy-leaking information that should be protected."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing clinical messages in order to determine if any confidential "
+                    f"information was included. Please answer the following question: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[SHCPRIVACYMedScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
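Both new scenarios read a flat CSV through csv.DictReader and key on the same three columns seen in the row[...] lookups above: prompt, context, and label (the label being one of the POSSIBLE_ANSWER_CHOICES letters). An illustrative, made-up row to show the expected shape only:

    # Illustrative sketch: column names come from the code above; the values are invented.
    import csv
    import io

    sample = io.StringIO()
    writer = csv.DictWriter(sample, fieldnames=["prompt", "context", "label"])
    writer.writeheader()
    writer.writerow(
        {
            "prompt": "Does this message contain confidential information?",
            "context": "Example portal message text.",
            "label": "B",
        }
    )
    print(sample.getvalue())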
@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPROXYMedScenario(Scenario):
+    """
+    This dataset features messages sent by proxy users and non proxy users, for evaluation of
+    LLM capabilities to determine the sender. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_proxy_med"
+    description = (
+        "ProxySender is a benchmark composed of patient portal messages received by clinicians."
+        "It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent,"
+        "spouse), which is critical for understanding who is communicating with healthcare"
+        "providers."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical messages in order to determine if they have been "
+                    f"sent by a proxy user. Please determine the following: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCPROXYMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -27,14 +28,19 @@ class SHCPTBMMedScenario(Scenario):
 
     name = "shc_ptbm_med"
     description = (
-        "A dataset that classifies whether a clinical note contains a clinician "
-        "recommendation for parent training in behavior management, which is the first-line "
-        "evidence-based treatment for young children with ADHD."
+        "ADHD-Behavior is a benchmark that evaluates a model’s ability to detect whether"
+        "a clinician recommends parent training in behavior management, an evidence-based"
+        "first-line treatment for young children diagnosed with ADHD. Each instance includes"
+        "a clinical note from a pediatric visit and a binary classification task."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -46,7 +52,7 @@ class SHCPTBMMedScenario(Scenario):
                 prompt = (
                     "You are reviewing a clinical note from health records of children with "
                     "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
-                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"behavioral therapy. Provide an answer to the following question: {question} with the "
                     f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
                     "for no. Do not provide any additional details or response, just a simple A or B response."
                 )
@@ -54,10 +60,9 @@ class SHCPTBMMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-PTBM-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCPTBMMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -30,14 +31,19 @@ class SHCSEIMedScenario(Scenario):
 
     name = "shc_sei_med"
     description = (
-        "A dataset that classifies whether a clinical note contains documentation "
-        "of side effect monitoring (recording of absence or presence of medication "
-        "side effects), as recommended in clinical practice guidelines."
+        "ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for"
+        "pediatric ADHD visits document medication side effect monitoring, which is a key recommendation"
+        "in clinical practice guidelines. The dataset supports binary classification"
+        "to detect presence or absence of side effect inquiries (SEI) within notes."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -59,7 +65,7 @@ class SHCSEIMedScenario(Scenario):
                     "categorized as SEI because they consist of a plan or an explanation about "
                     "side effects without actual side effect monitoring taking place, and "
                     "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -67,10 +73,9 @@ class SHCSEIMedScenario(Scenario):
         return data
 
    def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-SEI-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCSEIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,12 +23,18 @@ class SHCSequoiaMedScenario(Scenario):
 
     name = "shc_sequoia_med"
     description = (
-        "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic."
+        "ClinicReferral is a benchmark that determines patient eligibility for referral to the"
+        "Sequoia Clinic based on information from palliative care notes. The dataset provides"
+        "curated decisions on referral appropriateness to assist in automating clinic workflows."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -38,7 +45,7 @@ class SHCSequoiaMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -47,10 +54,11 @@ class SHCSequoiaMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-sequoia-dataset_filtered.csv"
-
+        check_file_exists(
+            self.data_path, msg=f"[SHCSequoiaMedScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES