crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
This release of crfm-helm has been flagged as potentially problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/run_specs/medhelm_run_specs.py

@@ -3,7 +3,12 @@
 Website: https://crfm.stanford.edu/helm/medhelm/
 """

-
+import importlib.resources as pkg_resources
+
+import os
+from typing import Dict, Union, Optional
+
+import yaml

 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -13,6 +18,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_multiple_choice_adapter_spec,
 )
 from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_exact_match_metric_specs,
@@ -22,10 +28,69 @@ from helm.benchmark.metrics.common_metric_specs import (
 )
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.run_specs.medhelm.benchmark_config import get_benchmark_config_from_path
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.common.gpu_utils import get_torch_device_name


+def get_judges_config(jury_config_path: Optional[str]) -> dict:
+    package = "helm.benchmark.scenarios.medhelm"
+    default_config_path = str(pkg_resources.files(package).joinpath("judges.yaml"))
+
+    if jury_config_path is None:
+        # Use the default config bundled with the package
+        jury_config_path = default_config_path
+
+    assert os.path.exists(jury_config_path), (
+        f"Judges config file not found: {jury_config_path}. "
+        f"If you are providing a custom config, make sure it follows the format specified in "
+        f"the default file: {default_config_path}"
+    )
+
+    with open(jury_config_path, "r") as f:
+        config = yaml.safe_load(f)
+
+    return config
+
+
+def get_annotator_models_from_config(jury_config_path: Optional[str]) -> Dict[str, AnnotatorModelInfo]:
+    config = get_judges_config(jury_config_path)
+    annotator_models = {
+        judge["name"]: AnnotatorModelInfo(
+            model_name=judge["model"],
+            model_deployment=judge["model_deployment"],
+        )
+        for judge in config["judges"]
+    }
+    return annotator_models
+
+
+@run_spec_function("medhelm_configurable_benchmark")
+def get_medhelm_configurable_benchmark_spec(config_path: str) -> RunSpec:
+    benchmark_config = get_benchmark_config_from_path(config_path)
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.medhelm_configurable_scenario.MedHELMConfigurableScenario",
+        args={"name": benchmark_config.name, "config_path": config_path},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        max_tokens=benchmark_config.max_tokens,
+        max_train_instances=0,
+        stop_sequences=[],
+    )
+    annotator_specs = benchmark_config.get_annotator_specs()
+    metric_specs = benchmark_config.get_metric_specs()
+
+    return RunSpec(
+        name=benchmark_config.name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=[benchmark_config.name],
+    )
+
+
 @run_spec_function("medcalc_bench")
 def get_medcalc_bench_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario")
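Note: the default judge pool now ships as helm/benchmark/scenarios/medhelm/judges.yaml, and get_judges_config falls back to it whenever no jury_config_path is given. The bundled file is not reproduced in this diff; the sketch below only illustrates the structure that get_annotator_models_from_config consumes (a top-level "judges" list whose entries have "name", "model", and "model_deployment" keys). The judge names and model identifiers are placeholders, not the packaged defaults.

# Illustrative only: a custom judges file with the same shape as the bundled
# judges.yaml. Judge names and model/deployment identifiers are placeholders.
import yaml

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo

EXAMPLE_JUDGES_YAML = """\
judges:
  - name: judge_a
    model: example-provider/judge-model-a
    model_deployment: example-provider/judge-model-a
  - name: judge_b
    model: example-provider/judge-model-b
    model_deployment: example-provider/judge-model-b
"""

config = yaml.safe_load(EXAMPLE_JUDGES_YAML)
# Mirrors get_annotator_models_from_config(): one AnnotatorModelInfo per judge.
annotator_models = {
    judge["name"]: AnnotatorModelInfo(
        model_name=judge["model"],
        model_deployment=judge["model_deployment"],
    )
    for judge in config["judges"]
}
print(sorted(annotator_models))  # ['judge_a', 'judge_b']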
@@ -91,7 +156,7 @@ def get_clear_spec(condition: str, data_path: str) -> RunSpec:


 @run_spec_function("mtsamples_replicate")
-def get_mtsamples_spec() -> RunSpec:
+def get_mtsamples_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mtsamples_replicate_scenario.MTSamplesReplicateScenario"
     )
@@ -106,8 +171,15 @@ def get_mtsamples_spec() -> RunSpec:
         stop_sequences=[],
     )

+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mtsamples_replicate_annotator.MTSamplesReplicateAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]

     metric_args = {
@@ -118,7 +190,15 @@ def get_mtsamples_spec() -> RunSpec:
     }

     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mtsamples_replicate_accuracy",
+                "scenario_name": "mtsamples_replicate",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]

     return RunSpec(
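This hunk is the first of many below that thread an optional jury_config_path argument through the MedHELM open-ended run specs; it defaults to None, which falls back to the packaged judges.yaml. In a run entry this would typically be written as, e.g., mtsamples_replicate:jury_config_path=/path/to/custom_judges.yaml (run-entry argument syntax may vary by HELM version). A minimal sketch of calling the updated function directly, with a placeholder path:

# Sketch only: constructing the updated run spec with a custom judge pool.
# The path is a placeholder; jury_config_path=None (the default) uses the
# judges.yaml bundled with the package instead.
from helm.benchmark.run_specs.medhelm_run_specs import get_mtsamples_spec

run_spec = get_mtsamples_spec(jury_config_path="/path/to/custom_judges.yaml")
# The same judge dict is shared by the scenario's annotator and the LLM-jury metric.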
@@ -293,6 +373,50 @@ def get_medbullets_run_spec() -> RunSpec:
     )


+@run_spec_function("medhelm_med_qa")
+def get_medhelm_med_qa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D. Do not include any explanation or additional text.",
+        input_noun="Question",
+        output_noun="Respond only with 'A', 'B', 'C' or 'D'. Do not add any other text, punctuation, or symbols.",
+        max_tokens=1,
+        max_train_instances=0,
+    )
+
+    return RunSpec(
+        name="med_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["med_qa"],
+    )
+
+
+@run_spec_function("medhelm_med_mcqa")
+def get_medhelm_med_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_mcqa_scenario.MedMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D. Do not include any explanation or additional text.",
+        input_noun="Question",
+        output_noun="Respond only with 'A', 'B', 'C' or 'D'. Do not add any other text, punctuation, or symbols.",
+        max_tokens=1,
+        max_train_instances=0,
+    )
+
+    return RunSpec(
+        name="med_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["med_mcqa"],
+    )
+
+
 @run_spec_function("medbullets_freetext")
 def get_medbullets_freetext_run_spec() -> RunSpec:
     """RunSpec for the MedBullets Free-text dataset."""
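The two run spec functions added above are registered as medhelm_med_qa and medhelm_med_mcqa, but the RunSpecs they return are named med_qa and med_mcqa. An illustrative snippet showing that distinction:

# Illustrative only: instantiating the two new MedHELM multiple-choice run specs.
from helm.benchmark.run_specs.medhelm_run_specs import (
    get_medhelm_med_mcqa_spec,
    get_medhelm_med_qa_spec,
)

for spec in (get_medhelm_med_qa_spec(), get_medhelm_med_mcqa_spec()):
    # Registered under "medhelm_med_qa"/"medhelm_med_mcqa", while the RunSpec
    # names and groups are "med_qa"/"med_mcqa".
    print(spec.name, spec.groups)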
@@ -330,7 +454,7 @@ def get_medbullets_freetext_run_spec() -> RunSpec:


 @run_spec_function("medalign")
-def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
+def get_medalign_spec(data_path: str, jury_config_path: Optional[str] = None, max_length: int = 100000) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario",
         args={
@@ -349,7 +473,16 @@ def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
         max_train_instances=0,
     )

-
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.medalign_annotator.MedalignAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]

     metric_args = {
         "task": "medalign",
@@ -358,7 +491,15 @@ def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "medalign_accuracy",
+                "scenario_name": "medalign",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]

     return RunSpec(
@@ -418,7 +559,7 @@ def get_shc_sei_spec(data_path: str) -> RunSpec:


 @run_spec_function("dischargeme")
-def get_dischargeme_spec(data_path: str) -> RunSpec:
+def get_dischargeme_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.dischargeme_scenario.DischargeMeScenario",
         args={
@@ -440,7 +581,16 @@ def get_dischargeme_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
     )

-
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.dischargeme_annotator.DischargeMeAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]

     metric_args = {
         "task": "dischargeme",
@@ -449,7 +599,15 @@ def get_dischargeme_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "dischargeme_accuracy",
+                "scenario_name": "dischargeme",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="dischargeme",
@@ -462,13 +620,12 @@ def get_dischargeme_spec(data_path: str) -> RunSpec:


 @run_spec_function("aci_bench")
-def get_aci_bench_run_spec() -> RunSpec:
+def get_aci_bench_run_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     """
     RunSpec for the ACI-Bench dataset.
     This configuration evaluates the model's ability to summarize
     doctor-patient dialogues into structured clinical notes.
     """
-    # Define the scenario
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.aci_bench_scenario.ACIBenchScenario",
         args={},
@@ -491,7 +648,16 @@ def get_aci_bench_run_spec() -> RunSpec:
         stop_sequences=[],
     )

-
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.aci_bench_annotator.ACIBenchAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]

     # Define the metrics
     metric_args = {
@@ -501,7 +667,15 @@ def get_aci_bench_run_spec() -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "aci_bench_accuracy",
+                "scenario_name": "aci_bench",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]

     # Return the RunSpec
@@ -516,7 +690,7 @@ def get_aci_bench_run_spec() -> RunSpec:


 @run_spec_function("mtsamples_procedures")
-def get_mtsamples_procedures_spec() -> RunSpec:
+def get_mtsamples_procedures_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mtsamples_procedures_scenario.MTSamplesProceduresScenario"
     )
@@ -530,10 +704,14 @@ def get_mtsamples_procedures_spec() -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)

     annotator_specs = [
         AnnotatorSpec(
-            class_name="helm.benchmark.annotation.mtsamples_procedures_annotator.MTSamplesProceduresAnnotator"
+            class_name="helm.benchmark.annotation.mtsamples_procedures_annotator.MTSamplesProceduresAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
         )
     ]

@@ -545,7 +723,15 @@ def get_mtsamples_procedures_spec() -> RunSpec:
     }

     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mtsamples_procedures_accuracy",
+                "scenario_name": "mtsamples_procedures",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]

     return RunSpec(
@@ -559,7 +745,7 @@ def get_mtsamples_procedures_spec() -> RunSpec:


 @run_spec_function("mimic_rrs")
-def get_mimic_rrs_spec(data_path: str) -> RunSpec:
+def get_mimic_rrs_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mimic_rrs_scenario.MIMICRRSScenario",
         args={"data_path": data_path},
@@ -578,7 +764,17 @@ def get_mimic_rrs_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mimic_rrs_annotator.MIMICRRSAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]

     metric_args = {
         "task": "mimic_rrs",
@@ -587,7 +783,15 @@ def get_mimic_rrs_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mimic_rrs_accuracy",
+                "scenario_name": "mimic_rrs",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="mimic_rrs",
@@ -600,7 +804,7 @@ def get_mimic_rrs_spec(data_path: str) -> RunSpec:


 @run_spec_function("mimic_bhc")
-def get_mimic_bhc_spec(data_path: str) -> RunSpec:
+def get_mimic_bhc_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario",
         args={"data_path": data_path},
@@ -616,7 +820,17 @@ def get_mimic_bhc_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mimic_bhc_annotator.MIMICBHCAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]

     metric_args = {
         "task": "mimic_bhc",
@@ -625,7 +839,15 @@ def get_mimic_bhc_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mimic_bhc_accuracy",
+                "scenario_name": "mimic_bhc",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="mimic_bhc",
@@ -638,7 +860,7 @@ def get_mimic_bhc_spec(data_path: str) -> RunSpec:


 @run_spec_function("chw_care_plan")
-def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
+def get_chw_care_plan_run_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     """
     RunSpec for the chw_care_plan dataset.
     This configuration evaluates the model's ability to summarize
@@ -659,8 +881,16 @@ def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.chw_care_plan_annotator.CHWCarePlanAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]

     metric_args = {
@@ -670,7 +900,15 @@ def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "chw_care_plan_accuracy",
+                "scenario_name": "chw_care_plan",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     # Return the RunSpec
     return RunSpec(
@@ -684,7 +922,7 @@ def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:


 @run_spec_function("medication_qa")
-def get_medication_qa_spec() -> RunSpec:
+def get_medication_qa_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")

     adapter_spec = get_generation_adapter_spec(
@@ -695,8 +933,15 @@ def get_medication_qa_spec() -> RunSpec:
         max_tokens=512,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]
     metric_args = {
         "task": "medication_qa",
@@ -705,7 +950,15 @@ def get_medication_qa_spec() -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "medication_qa_accuracy",
+                "scenario_name": "medication_qa",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="medication_qa",
@@ -718,7 +971,7 @@ def get_medication_qa_spec() -> RunSpec:


 @run_spec_function("starr_patient_instructions")
-def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
+def get_starr_patient_instructions_run_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.starr_patient_instructions_scenario.StarrPatientInstructionsScenario",
         args={"data_path": data_path},
@@ -739,11 +992,16 @@ def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
         AnnotatorSpec(
             class_name=(
                 "helm.benchmark.annotation.starr_patient_instructions_annotator.StarrPatientInstructionsAnnotator"
-            )
+            ),
+            args={
+                "annotator_models": annotator_models,
+            },
         )
     ]

@@ -753,16 +1011,17 @@ def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
         "bertscore_model": "distilbert-base-uncased",
         "rescale_with_baseline": False,
     }
-    metric_specs = (
-
-
-
-
-
-
-
-
-
+    metric_specs = get_summarization_metric_specs(metric_args) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "starr_patient_instructions_accuracy",
+                "scenario_name": "starr_patient_instructions",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
+    ]
     return RunSpec(
         name="starr_patient_instructions",
         scenario_spec=scenario_spec,
@@ -774,7 +1033,7 @@ def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:


 @run_spec_function("med_dialog")
-def get_med_dialog_spec(subset: str) -> RunSpec:
+def get_med_dialog_spec(subset: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
     )
@@ -787,7 +1046,17 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.med_dialog_annotator.MedDialogAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]

     metric_args = {
         "task": "med_dialog",
@@ -796,7 +1065,15 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "med_dialog_accuracy",
+                "scenario_name": "med_dialog",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name=f"med_dialog,subset={subset}",
@@ -832,7 +1109,7 @@ def get_shc_conf_spec(data_path: str) -> RunSpec:


 @run_spec_function("medi_qa")
-def get_medi_qa_spec() -> RunSpec:
+def get_medi_qa_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medi_qa_scenario.MediQAScenario", args={})

     adapter_spec = get_generation_adapter_spec(
@@ -843,7 +1120,17 @@ def get_medi_qa_spec() -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.medi_qa_annotator.MediQAAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]

     metric_args = {
         "task": "medi_qa",
@@ -852,7 +1139,15 @@ def get_medi_qa_spec() -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "medi_qa_accuracy",
+                "scenario_name": "medi_qa",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="medi_qa",
@@ -865,7 +1160,7 @@ def get_medi_qa_spec() -> RunSpec:


 @run_spec_function("mental_health")
-def get_mental_health_spec(data_path: str) -> RunSpec:
+def get_mental_health_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     """
     Returns the run specification for the mental health counseling scenario.
     This scenario evaluates a model's ability to generate appropriate counseling responses
@@ -886,8 +1181,15 @@ def get_mental_health_spec(data_path: str) -> RunSpec:
         max_tokens=512,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mental_health_annotator.MentalHealthAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]

     metric_args = {
@@ -897,7 +1199,15 @@ def get_mental_health_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mental_health_accuracy",
+                "scenario_name": "mental_health",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]

     return RunSpec(
@@ -1217,7 +1527,7 @@ def get_shc_ent_spec(data_path: str) -> RunSpec:
 @run_spec_function("shc_privacy_med")
 def get_shc_privacy_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.
+        class_name="helm.benchmark.scenarios.shc_privacy_scenario.SHCPRIVACYMedScenario",
         args={"data_path": data_path},
     )

@@ -1240,7 +1550,7 @@ def get_shc_privacy_spec(data_path: str) -> RunSpec:
 @run_spec_function("shc_proxy_med")
 def get_shc_proxy_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.
+        class_name="helm.benchmark.scenarios.shc_proxy_scenario.SHCPROXYMedScenario",
         args={"data_path": data_path},
     )

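Most of the hunks above repeat one pattern: each open-ended MedHELM run spec builds its judge pool once with get_annotator_models_from_config and passes it both to the scenario-specific annotator and to an LLMJuryMetric whose default_score is 1.0. A condensed sketch of that pairing follows; the helper function is illustrative only and is not part of crfm-helm.

# Illustrative helper, not part of crfm-helm: the annotator/metric pairing
# that this diff applies to each open-ended MedHELM scenario.
from typing import Dict

from helm.benchmark.annotation.annotator import AnnotatorSpec
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
from helm.benchmark.metrics.metric import MetricSpec


def build_llm_jury_specs(
    scenario_name: str,
    annotator_class_name: str,
    annotator_models: Dict[str, AnnotatorModelInfo],
):
    # One annotator per scenario, parameterized by the shared judge pool.
    annotator_specs = [
        AnnotatorSpec(
            class_name=annotator_class_name,
            args={"annotator_models": annotator_models},
        )
    ]
    # The LLM-jury metric reads the annotations produced by those judges.
    metric_specs = [
        MetricSpec(
            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
            args={
                "metric_name": f"{scenario_name}_accuracy",
                "scenario_name": scenario_name,
                "annotator_models": annotator_models,
                "default_score": 1.0,
            },
        )
    ]
    return annotator_specs, metric_specs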