crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run_specs/medhelm_run_specs.py

@@ -37,6 +37,7 @@ def get_medcalc_bench_spec() -> RunSpec:
         output_noun="Answer only the requested quantity without units. No explanation needed",
         max_tokens=10,
         max_train_instances=0,
+        stop_sequences=[],
     )

     metric_specs = [
@@ -56,9 +57,13 @@ def get_medcalc_bench_spec() -> RunSpec:


 @run_spec_function("clear")
-def get_clear_spec(condition: str) -> RunSpec:
+def get_clear_spec(condition: str, data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario",
+        class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario",
+        args={
+            "condition": condition,
+            "data_path": data_path,
+        },
     )

     condition_display = condition.replace("_", " ")
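The data_path argument threaded through these run spec functions lands in ScenarioSpec.args and is forwarded to the scenario constructor when the run executes. A hedged sketch of calling the updated function directly; the condition value and path are placeholders, and the run-entry string in the comment assumes HELM's usual scenario:arg=value syntax.

from helm.benchmark.run_specs.medhelm_run_specs import get_clear_spec

# Roughly what helm-run does for a run entry such as
#   "clear:condition=substance_use,data_path=/data/clear"
# (placeholder condition and path).
run_spec = get_clear_spec(condition="substance_use", data_path="/data/clear")
print(run_spec.scenario_spec.args)
# {'condition': 'substance_use', 'data_path': '/data/clear'}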
@@ -157,6 +162,7 @@ def get_medec_run_spec() -> RunSpec:
         output_noun="Answer",
         max_tokens=256,
         max_train_instances=0,
+        stop_sequences=[],
     )

     # Define the metrics
@@ -178,10 +184,14 @@ def get_medec_run_spec() -> RunSpec:


 @run_spec_function("ehrshot")
-def get_ehrshot_spec(subject: str, max_length: int = 100000) -> RunSpec:
+def get_ehrshot_spec(subject: str, data_path: str, max_length: int = 100000) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ehrshot_scenario.EHRSHOTScenario",
-        args={
+        args={
+            "subject": subject,
+            "max_length": max_length,
+            "data_path": data_path,
+        },
     )

     adapter_spec = get_multiple_choice_adapter_spec(
@@ -320,9 +330,13 @@ def get_medbullets_freetext_run_spec() -> RunSpec:


 @run_spec_function("medalign")
-def get_medalign_spec(max_length: int = 40000) -> RunSpec:
+def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario",
+        class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario",
+        args={
+            "max_length": max_length,
+            "data_path": data_path,
+        },
     )

     adapter_spec = get_generation_adapter_spec(
@@ -358,8 +372,11 @@ def get_medalign_spec(max_length: int = 40000) -> RunSpec:


 @run_spec_function("shc_ptbm_med")
-def get_shc_ptbm_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_ptbm_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_ptbm_scenario.SHCPTBMMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -378,8 +395,11 @@ def get_shc_ptbm_spec() -> RunSpec:


 @run_spec_function("shc_sei_med")
-def get_shc_sei_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_sei_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_sei_scenario.SHCSEIMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -398,8 +418,13 @@ def get_shc_sei_spec() -> RunSpec:


 @run_spec_function("dischargeme")
-def get_dischargeme_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_dischargeme_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.dischargeme_scenario.DischargeMeScenario",
+        args={
+            "data_path": data_path,
+        },
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=(
@@ -534,8 +559,11 @@ def get_mtsamples_procedures_spec() -> RunSpec:


 @run_spec_function("mimic_rrs")
-def get_mimic_rrs_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_mimic_rrs_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mimic_rrs_scenario.MIMICRRSScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=(
@@ -572,8 +600,11 @@ def get_mimic_rrs_spec() -> RunSpec:


 @run_spec_function("mimic_bhc")
-def get_mimic_bhc_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_mimic_bhc_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=("Summarize the clinical note into a brief hospital course."),
@@ -585,23 +616,29 @@ def get_mimic_bhc_spec() -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.mimic_bhc_annotator.MIMICBHCAnnotator")]
+
     metric_args = {
         "task": "mimic_bhc",
         "device": get_torch_device_name(),
         "bertscore_model": "distilbert-base-uncased",
         "rescale_with_baseline": False,
     }
+    metric_specs = get_summarization_metric_specs(metric_args) + [
+        MetricSpec(class_name="helm.benchmark.metrics.mimic_bhc_metrics.MIMICBHCMetric", args={})
+    ]
     return RunSpec(
         name="mimic_bhc",
+        annotators=annotator_specs,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=metric_specs,
         groups=["mimic_bhc"],
     )


 @run_spec_function("chw_care_plan")
-def get_chw_care_plan_run_spec() -> RunSpec:
+def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
     """
     RunSpec for the chw_care_plan dataset.
     This configuration evaluates the model's ability to summarize
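The mimic_bhc hunk above now builds its metric list by concatenating the shared summarization metrics with a scenario-specific MetricSpec. A sketch of that composition pattern, assuming MetricSpec and get_summarization_metric_specs are importable from helm.benchmark.metrics.metric and helm.benchmark.metrics.common_metric_specs respectively; the metric_args keys are copied from the diff, with a plain "cpu" device standing in for get_torch_device_name().

from helm.benchmark.metrics.common_metric_specs import get_summarization_metric_specs
from helm.benchmark.metrics.metric import MetricSpec

metric_args = {
    "task": "mimic_bhc",
    "device": "cpu",  # the run spec uses get_torch_device_name() here
    "bertscore_model": "distilbert-base-uncased",
    "rescale_with_baseline": False,
}
# get_summarization_metric_specs returns a list of MetricSpec, so plain list
# concatenation appends the scenario-specific metric to the shared set.
metric_specs = get_summarization_metric_specs(metric_args) + [
    MetricSpec(class_name="helm.benchmark.metrics.mimic_bhc_metrics.MIMICBHCMetric", args={})
]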
@@ -609,7 +646,7 @@ def get_chw_care_plan_run_spec() -> RunSpec:
     """
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.chw_care_plan_scenario.CHWCarePlanScenario",
-        args={},
+        args={"data_path": data_path},
     )

     adapter_spec = get_generation_adapter_spec(
@@ -681,10 +718,10 @@ def get_medication_qa_spec() -> RunSpec:


 @run_spec_function("starr_patient_instructions")
-def get_starr_patient_instructions_run_spec() -> RunSpec:
+def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.starr_patient_instructions_scenario.StarrPatientInstructionsScenario",
-        args={},
+        args={"data_path": data_path},
     )

     adapter_spec = get_generation_adapter_spec(
@@ -748,6 +785,7 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
         output_noun="Summary",
         max_tokens=80,
         max_train_instances=0,
+        stop_sequences=[],
     )
     annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.med_dialog_annotator.MedDialogAnnotator")]

@@ -771,8 +809,11 @@ def get_med_dialog_spec(subset: str) -> RunSpec:


 @run_spec_function("shc_conf_med")
-def get_shc_conf_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_conf_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_conf_scenario.SHCCONFMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -824,13 +865,16 @@ def get_medi_qa_spec() -> RunSpec:


 @run_spec_function("mental_health")
-def get_mental_health_spec() -> RunSpec:
+def get_mental_health_spec(data_path: str) -> RunSpec:
     """
     Returns the run specification for the mental health counseling scenario.
     This scenario evaluates a model's ability to generate appropriate counseling responses
     in mental health conversations.
     """
-    scenario_spec = ScenarioSpec(
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mental_health_scenario.MentalHealthScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=(
@@ -840,6 +884,7 @@ def get_mental_health_spec() -> RunSpec:
         newline_after_input_noun=False,
         output_noun="Counselor response",
         max_tokens=512,
+        stop_sequences=[],
     )
     annotator_specs = [
         AnnotatorSpec(class_name="helm.benchmark.annotation.mental_health_annotator.MentalHealthAnnotator")
@@ -871,7 +916,11 @@ def get_pubmed_qa_spec() -> RunSpec:

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions=
+        instructions=(
+            "Answer A for yes, B for no or C for maybe. "
+            "Do not include any explanation or additional text. "
+            "Output only the letter on a single line."
+        ),
         input_noun="Question",
         output_noun="Answer",
         max_train_instances=0,
@@ -937,8 +986,11 @@ def get_ehr_sql_run_spec() -> RunSpec:


 @run_spec_function("shc_bmt_med")
-def get_shc_bmt_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_bmt_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1002,6 +1054,7 @@ No letter or word, just the integer value.
 Your Judgment"""  # noqa: E501
         ),
         max_train_instances=0,
+        stop_sequences=[],
     )

     return RunSpec(
@@ -1014,17 +1067,17 @@ Your Judgment"""  # noqa: E501


 @run_spec_function("n2c2_ct_matching")
-def get_n2c2_ct_matching_spec(subject: str) -> RunSpec:
+def get_n2c2_ct_matching_spec(data_path: str, subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.n2c2_ct_matching_scenario.N2C2CTMatchingScenario",
-        args={"subject": subject},
+        args={"data_path": data_path, "subject": subject},
     )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions="Answer A for yes, B for no.",
         input_noun="",
-        output_noun="Answer A for yes, B for no",
+        output_noun="Answer A for yes, B for no. Do not add any other text, punctuation, or symbols",
         max_train_instances=0,
     )

@@ -1038,8 +1091,10 @@ def get_n2c2_ct_matching_spec(subject: str) -> RunSpec:


 @run_spec_function("shc_gip_med")
-def get_shc_gip_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_gip_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_gip_scenario.SHCGIPMedScenario", args={"data_path": data_path}
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1058,11 +1113,11 @@ def get_shc_gip_spec() -> RunSpec:


 @run_spec_function("mimiciv_billing_code")
-def get_mimiciv_billing_code_spec() -> RunSpec:
+def get_mimiciv_billing_code_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mimiciv_billing_code_scenario.MIMICIVBillingCodeScenario",
         args={
-            "
+            "data_path": data_path,
         },
     )
     adapter_spec = get_generation_adapter_spec(
@@ -1094,9 +1149,9 @@ def get_mimiciv_billing_code_spec() -> RunSpec:


 @run_spec_function("shc_sequoia_med")
-def get_shc_sequoia_spec() -> RunSpec:
+def get_shc_sequoia_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.shc_sequoia_scenario.SHCSequoiaMedScenario", args={}
+        class_name="helm.benchmark.scenarios.shc_sequoia_scenario.SHCSequoiaMedScenario", args={"data_path": data_path}
     )

     adapter_spec = get_multiple_choice_adapter_spec(
@@ -1116,8 +1171,10 @@ def get_shc_sequoia_spec() -> RunSpec:


 @run_spec_function("shc_cdi_med")
-def get_shc_cdi_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_cdi_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCCDIMedScenario", args={"data_path": data_path}
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1136,8 +1193,10 @@ def get_shc_cdi_spec() -> RunSpec:


 @run_spec_function("shc_ent_med")
-def get_shc_ent_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_ent_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_ent_scenario.SHCENTMedScenario", args={"data_path": data_path}
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1153,3 +1212,49 @@ def get_shc_ent_spec() -> RunSpec:
         metric_specs=get_exact_match_metric_specs(),
         groups=["shc_ent_med"],
     )
+
+
+@run_spec_function("shc_privacy_med")
+def get_shc_privacy_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario",
+        args={"data_path": data_path},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Answer A or B.",
+        input_noun="",
+        output_noun="",
+    )
+
+    return RunSpec(
+        name="shc_privacy_med",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["shc_privacy_med"],
+    )
+
+
+@run_spec_function("shc_proxy_med")
+def get_shc_proxy_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPROXYMedScenario",
+        args={"data_path": data_path},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Answer A or B.",
+        input_noun="",
+        output_noun="",
+    )
+
+    return RunSpec(
+        name="shc_proxy_med",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["shc_proxy_med"],
+    )
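The newly added shc_privacy_med and shc_proxy_med specs follow the same multiple-choice template as the other SHC tasks. A hedged usage sketch; the data path is a placeholder, and the run-entry syntax in the comment is assumed rather than taken from this release.

from helm.benchmark.run_specs.medhelm_run_specs import get_shc_privacy_spec

run_spec = get_shc_privacy_spec(data_path="/data/shc_privacy.csv")  # placeholder path
# A corresponding run entry would presumably look like
#   "shc_privacy_med:data_path=/data/shc_privacy.csv"
print(run_spec.name)  # shc_privacy_med
print(run_spec.scenario_spec.class_name)
# helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario (as written in this release)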