crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run_specs/medhelm_run_specs.py

@@ -37,6 +37,7 @@ def get_medcalc_bench_spec() -> RunSpec:
         output_noun="Answer only the requested quantity without units. No explanation needed",
         max_tokens=10,
         max_train_instances=0,
+        stop_sequences=[],
     )

     metric_specs = [
@@ -56,9 +57,13 @@ def get_medcalc_bench_spec() -> RunSpec:


 @run_spec_function("clear")
-def get_clear_spec(condition: str) -> RunSpec:
+def get_clear_spec(condition: str, data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario",
+        class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario",
+        args={
+            "condition": condition,
+            "data_path": data_path,
+        },
     )

     condition_display = condition.replace("_", " ")
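The data_path argument threaded through these run spec functions lands in ScenarioSpec.args and is forwarded to the scenario constructor when the run executes. A hedged sketch of calling the updated function directly; the condition value and path are placeholders, and the run-entry string in the comment assumes HELM's usual scenario:arg=value syntax.

from helm.benchmark.run_specs.medhelm_run_specs import get_clear_spec

# Roughly what helm-run does for a run entry such as
#   "clear:condition=substance_use,data_path=/data/clear"
# (placeholder condition and path).
run_spec = get_clear_spec(condition="substance_use", data_path="/data/clear")
print(run_spec.scenario_spec.args)
# {'condition': 'substance_use', 'data_path': '/data/clear'}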
@@ -157,6 +162,7 @@ def get_medec_run_spec() -> RunSpec:
         output_noun="Answer",
         max_tokens=256,
         max_train_instances=0,
+        stop_sequences=[],
     )

     # Define the metrics
@@ -178,10 +184,14 @@ def get_medec_run_spec() -> RunSpec:


 @run_spec_function("ehrshot")
-def get_ehrshot_spec(subject: str, max_length: int = 100000) -> RunSpec:
+def get_ehrshot_spec(subject: str, data_path: str, max_length: int = 100000) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ehrshot_scenario.EHRSHOTScenario",
-        args={
+        args={
+            "subject": subject,
+            "max_length": max_length,
+            "data_path": data_path,
+        },
     )

     adapter_spec = get_multiple_choice_adapter_spec(
@@ -320,9 +330,13 @@ def get_medbullets_freetext_run_spec() -> RunSpec:


 @run_spec_function("medalign")
-def get_medalign_spec(max_length: int = 40000) -> RunSpec:
+def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario",
+        class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario",
+        args={
+            "max_length": max_length,
+            "data_path": data_path,
+        },
     )

     adapter_spec = get_generation_adapter_spec(
@@ -358,8 +372,11 @@ def get_medalign_spec(max_length: int = 40000) -> RunSpec:


 @run_spec_function("shc_ptbm_med")
-def get_shc_ptbm_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_ptbm_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_ptbm_scenario.SHCPTBMMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -378,8 +395,11 @@ def get_shc_ptbm_spec() -> RunSpec:


 @run_spec_function("shc_sei_med")
-def get_shc_sei_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_sei_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_sei_scenario.SHCSEIMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -398,8 +418,13 @@ def get_shc_sei_spec() -> RunSpec:


 @run_spec_function("dischargeme")
-def get_dischargeme_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_dischargeme_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.dischargeme_scenario.DischargeMeScenario",
+        args={
+            "data_path": data_path,
+        },
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=(
@@ -534,8 +559,11 @@ def get_mtsamples_procedures_spec() -> RunSpec:


 @run_spec_function("mimic_rrs")
-def get_mimic_rrs_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_mimic_rrs_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mimic_rrs_scenario.MIMICRRSScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=(
@@ -572,8 +600,11 @@ def get_mimic_rrs_spec() -> RunSpec:


 @run_spec_function("mimic_bhc")
-def get_mimic_bhc_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_mimic_bhc_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=("Summarize the clinical note into a brief hospital course."),
@@ -585,23 +616,29 @@ def get_mimic_bhc_spec() -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.mimic_bhc_annotator.MIMICBHCAnnotator")]
+
     metric_args = {
         "task": "mimic_bhc",
         "device": get_torch_device_name(),
         "bertscore_model": "distilbert-base-uncased",
         "rescale_with_baseline": False,
     }
+    metric_specs = get_summarization_metric_specs(metric_args) + [
+        MetricSpec(class_name="helm.benchmark.metrics.mimic_bhc_metrics.MIMICBHCMetric", args={})
+    ]
     return RunSpec(
         name="mimic_bhc",
+        annotators=annotator_specs,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=metric_specs,
         groups=["mimic_bhc"],
     )


 @run_spec_function("chw_care_plan")
-def get_chw_care_plan_run_spec() -> RunSpec:
+def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
     """
     RunSpec for the chw_care_plan dataset.
     This configuration evaluates the model's ability to summarize
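The mimic_bhc hunk above now builds its metric list by concatenating the shared summarization metrics with a scenario-specific MetricSpec. A sketch of that composition pattern, assuming MetricSpec and get_summarization_metric_specs are importable from helm.benchmark.metrics.metric and helm.benchmark.metrics.common_metric_specs respectively; the metric_args keys are copied from the diff, with a plain "cpu" device standing in for get_torch_device_name().

from helm.benchmark.metrics.common_metric_specs import get_summarization_metric_specs
from helm.benchmark.metrics.metric import MetricSpec

metric_args = {
    "task": "mimic_bhc",
    "device": "cpu",  # the run spec uses get_torch_device_name() here
    "bertscore_model": "distilbert-base-uncased",
    "rescale_with_baseline": False,
}
# get_summarization_metric_specs returns a list of MetricSpec, so plain list
# concatenation appends the scenario-specific metric to the shared set.
metric_specs = get_summarization_metric_specs(metric_args) + [
    MetricSpec(class_name="helm.benchmark.metrics.mimic_bhc_metrics.MIMICBHCMetric", args={})
]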
@@ -609,7 +646,7 @@ def get_chw_care_plan_run_spec() -> RunSpec:
     """
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.chw_care_plan_scenario.CHWCarePlanScenario",
-        args={},
+        args={"data_path": data_path},
     )

     adapter_spec = get_generation_adapter_spec(
@@ -681,10 +718,10 @@ def get_medication_qa_spec() -> RunSpec:


 @run_spec_function("starr_patient_instructions")
-def get_starr_patient_instructions_run_spec() -> RunSpec:
+def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.starr_patient_instructions_scenario.StarrPatientInstructionsScenario",
-        args={},
+        args={"data_path": data_path},
     )

     adapter_spec = get_generation_adapter_spec(
@@ -748,6 +785,7 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
         output_noun="Summary",
         max_tokens=80,
         max_train_instances=0,
+        stop_sequences=[],
     )
     annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.med_dialog_annotator.MedDialogAnnotator")]

@@ -771,8 +809,11 @@ def get_med_dialog_spec(subset: str) -> RunSpec:


 @run_spec_function("shc_conf_med")
-def get_shc_conf_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_conf_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_conf_scenario.SHCCONFMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -824,13 +865,16 @@ def get_medi_qa_spec() -> RunSpec:


 @run_spec_function("mental_health")
-def get_mental_health_spec() -> RunSpec:
+def get_mental_health_spec(data_path: str) -> RunSpec:
     """
     Returns the run specification for the mental health counseling scenario.
     This scenario evaluates a model's ability to generate appropriate counseling responses
     in mental health conversations.
     """
-    scenario_spec = ScenarioSpec(
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mental_health_scenario.MentalHealthScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_generation_adapter_spec(
         instructions=(
@@ -840,6 +884,7 @@ def get_mental_health_spec() -> RunSpec:
         newline_after_input_noun=False,
         output_noun="Counselor response",
         max_tokens=512,
+        stop_sequences=[],
     )
     annotator_specs = [
         AnnotatorSpec(class_name="helm.benchmark.annotation.mental_health_annotator.MentalHealthAnnotator")
@@ -871,7 +916,11 @@ def get_pubmed_qa_spec() -> RunSpec:

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions=
+        instructions=(
+            "Answer A for yes, B for no or C for maybe. "
+            "Do not include any explanation or additional text. "
+            "Output only the letter on a single line."
+        ),
         input_noun="Question",
         output_noun="Answer",
         max_train_instances=0,
@@ -937,8 +986,11 @@ def get_ehr_sql_run_spec() -> RunSpec:


 @run_spec_function("shc_bmt_med")
-def get_shc_bmt_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_bmt_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario",
+        args={"data_path": data_path},
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1002,6 +1054,7 @@ No letter or word, just the integer value.
 Your Judgment"""  # noqa: E501
         ),
         max_train_instances=0,
+        stop_sequences=[],
     )

     return RunSpec(
@@ -1014,17 +1067,17 @@ Your Judgment"""  # noqa: E501


 @run_spec_function("n2c2_ct_matching")
-def get_n2c2_ct_matching_spec(subject: str) -> RunSpec:
+def get_n2c2_ct_matching_spec(data_path: str, subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.n2c2_ct_matching_scenario.N2C2CTMatchingScenario",
-        args={"subject": subject},
+        args={"data_path": data_path, "subject": subject},
     )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions="Answer A for yes, B for no.",
         input_noun="",
-        output_noun="Answer A for yes, B for no",
+        output_noun="Answer A for yes, B for no. Do not add any other text, punctuation, or symbols",
         max_train_instances=0,
     )

@@ -1038,8 +1091,10 @@ def get_n2c2_ct_matching_spec(subject: str) -> RunSpec:


 @run_spec_function("shc_gip_med")
-def get_shc_gip_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_gip_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_gip_scenario.SHCGIPMedScenario", args={"data_path": data_path}
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1058,11 +1113,11 @@ def get_shc_gip_spec() -> RunSpec:


 @run_spec_function("mimiciv_billing_code")
-def get_mimiciv_billing_code_spec() -> RunSpec:
+def get_mimiciv_billing_code_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mimiciv_billing_code_scenario.MIMICIVBillingCodeScenario",
         args={
-            "
+            "data_path": data_path,
         },
     )
     adapter_spec = get_generation_adapter_spec(
@@ -1094,9 +1149,9 @@ def get_mimiciv_billing_code_spec() -> RunSpec:


 @run_spec_function("shc_sequoia_med")
-def get_shc_sequoia_spec() -> RunSpec:
+def get_shc_sequoia_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.shc_sequoia_scenario.SHCSequoiaMedScenario", args={}
+        class_name="helm.benchmark.scenarios.shc_sequoia_scenario.SHCSequoiaMedScenario", args={"data_path": data_path}
     )

     adapter_spec = get_multiple_choice_adapter_spec(
@@ -1116,8 +1171,10 @@ def get_shc_sequoia_spec() -> RunSpec:


 @run_spec_function("shc_cdi_med")
-def get_shc_cdi_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_cdi_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCCDIMedScenario", args={"data_path": data_path}
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1136,8 +1193,10 @@ def get_shc_cdi_spec() -> RunSpec:


 @run_spec_function("shc_ent_med")
-def get_shc_ent_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
+def get_shc_ent_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_ent_scenario.SHCENTMedScenario", args={"data_path": data_path}
+    )

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1153,3 +1212,49 @@ def get_shc_ent_spec() -> RunSpec:
         metric_specs=get_exact_match_metric_specs(),
         groups=["shc_ent_med"],
     )
+
+
+@run_spec_function("shc_privacy_med")
+def get_shc_privacy_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario",
+        args={"data_path": data_path},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Answer A or B.",
+        input_noun="",
+        output_noun="",
+    )
+
+    return RunSpec(
+        name="shc_privacy_med",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["shc_privacy_med"],
+    )
+
+
+@run_spec_function("shc_proxy_med")
+def get_shc_proxy_spec(data_path: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPROXYMedScenario",
+        args={"data_path": data_path},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Answer A or B.",
+        input_noun="",
+        output_noun="",
+    )
+
+    return RunSpec(
+        name="shc_proxy_med",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["shc_proxy_med"],
+    )
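The newly added shc_privacy_med and shc_proxy_med specs follow the same multiple-choice template as the other SHC tasks. A hedged usage sketch; the data path is a placeholder, and the run-entry syntax in the comment is assumed rather than taken from this release.

from helm.benchmark.run_specs.medhelm_run_specs import get_shc_privacy_spec

run_spec = get_shc_privacy_spec(data_path="/data/shc_privacy.csv")  # placeholder path
# A corresponding run entry would presumably look like
#   "shc_privacy_med:data_path=/data/shc_privacy.csv"
print(run_spec.name)  # shc_privacy_med
print(run_spec.scenario_spec.class_name)
# helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario (as written in this release)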