crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py (new file)
@@ -0,0 +1,68 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MBZUAIHumanTranslatedArabicMMLUScenario(Scenario):
+    """MBZUAI Human-Translated Arabic MMLU
+
+    A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+    """  # noqa: E501
+
+    name = "mbzuai_human_translated_arabic_mmlu"
+    description = (
+        "A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark"
+    )
+    tags = ["knowledge", "multiple_choice"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "MBZUAI/human_translated_arabic_mmlu",
+            self.subject,
+            revision="5ed7830fd678cfa6f2d7f0a1a13a4e1a1fa422ac",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            input = Input(text=row["question"])
+            references: List[Reference] = []
+            for choice_index, choice in enumerate(row["choices"]):
+                references.append(
+                    Reference(
+                        output=Output(text=choice),
+                        tags=[CORRECT_TAG] if choice_index == row["answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id-{self.subject}-{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
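For orientation, a minimal usage sketch of the new scenario follows; the subject name and output path are illustrative assumptions, not values taken from this diff.

from helm.benchmark.scenarios.mbzuai_human_translated_arabic_mmlu import (
    MBZUAIHumanTranslatedArabicMMLUScenario,
)

# "abstract_algebra" is an assumed MMLU subject name; the output path is illustrative.
scenario = MBZUAIHumanTranslatedArabicMMLUScenario(subject="abstract_algebra")
instances = scenario.get_instances(output_path="./scenario_cache")
print(f"Loaded {len(instances)} test instances for subject {scenario.subject}")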
helm/benchmark/scenarios/med_dialog_scenario.py
@@ -2,8 +2,18 @@ import json
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Input,
+    Output,
+    ScenarioMetadata,
+)


 class MedDialogScenario(Scenario):
@@ -133,3 +143,24 @@ class MedDialogScenario(Scenario):
         )

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_dialog",
+            display_name="MedDialog",
+            short_display_name="MedDialog",
+            description="MedDialog is a benchmark of real-world doctor-patient conversations focused on "
+            "health-related concerns and advice. Each dialogue is paired with a "
+            "one-sentence summary that reflects the core patient question or exchange. The "
+            "benchmark evaluates a model's ability to condense medical dialogue into "
+            "concise, informative summaries.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate summaries of doctor-patient conversations",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="med_dialog_accuracy",
+            main_split="test",
+        )
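The same get_metadata addition recurs across the scenario files below. A condensed sketch of the shared pattern, with placeholder values rather than any one scenario's fields, assuming the classes exist as imported in the hunks above:

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import ScenarioMetadata

# Placeholder values; each scenario in this diff fills these in with its own
# name, description, taxonomy, main metric, and main split.
metadata = ScenarioMetadata(
    name="example_scenario",
    display_name="Example Scenario",
    description="Placeholder description of what the benchmark evaluates.",
    taxonomy=TaxonomyInfo(
        task="Question answering",
        what="n/a",
        when="Any",
        who="Researcher",
        language="English",
    ),
    main_metric="exact_match",
    main_split="test",
)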
helm/benchmark/scenarios/med_mcqa_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -109,3 +111,15 @@ class MedMCQAScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_mcqa",
+            display_name="MedMCQA",
+            description='MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to '
+            "address real-world medical entrance exam questions ([Flores et al. "
+            "2020](https://arxiv.org/abs/2203.14371)).",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )
helm/benchmark/scenarios/med_qa_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -103,3 +105,21 @@ class MedQAScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="med_qa",
+            display_name="MedQA",
+            description="MedQA is an open domain question answering dataset composed of questions from "
+            "professional medical board exams ([Jin et al. "
+            "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="US medical licensing exams",
+                when="before 2020",
+                who="problem setters",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/medalign_scenario.py
@@ -1,5 +1,6 @@
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.medalign_scenario_helper import return_dataset_dataframe  # type: ignore

@@ -92,3 +94,24 @@ class MedalignScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         dataset = return_dataset_dataframe(self.max_length, self.data_path)
         return self.process_tsv(dataset)
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medalign",
+            display_name="MedAlign",
+            short_display_name="MedAlign",
+            description="MedAlign is a benchmark that evaluates a model's ability to interpret and "
+            "follow instructions grounded in longitudinal electronic health records (EHR). "
+            "Each instance includes an event-stream style patient record and a natural "
+            "language question or task, requiring clinically informed reading comprehension "
+            "and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Answer questions and follow instructions over longitudinal EHR",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medalign_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/medalign_scenario_helper.py
@@ -2,22 +2,13 @@
 # type: ignore
 # fmt: off

-import ast
-import datetime
 import transformers
-import langchain
-import langchain.prompts
-import lxml.etree
 import os
 import pandas as pd
-import re
 import tiktoken

-from langchain_community.retrievers import BM25Retriever
 from tqdm import tqdm
-from typing import Any, Dict, Optional,
-from langchain.schema import Document
-import langchain_community
+from typing import Any, Dict, Optional, Callable

 from helm.common.general import check_file_exists

@@ -167,102 +158,13 @@ def get_tokenizer(tokenizer_name: str) -> Callable:
     return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)


-def retrieve_most_relevant_visits(ehr_visit_strs, query, target_length, tokenizer):
-    """
-    Retrieve and filter relevant EHR visits based on a query and target length.
-
-    This function retrieves electronic health record (EHR) visit strings, sorts them
-    by relevance using the BM25Retriever, and constructs a list of final documents
-    that fit within a specified character length. The final list ensures that the
-    most important visit isn't cut off and is sorted chronologically.
-
-    Parameters:
-        ehr_visit_strs (list of str): List of EHR visit strings.
-        query (str): Query string to retrieve relevant visits.
-        target_length (int): Maximum total token count for the final list of documents.
-        tokenizer (Callable): Tokenizer that converts text to tokens (used for tracking context length)
-
-    Returns:
-        list[str]: List of EHR visit strings sorted chronologically and constrained by the target length.
-    """
-    ehr_visits=re.split(r'(?=</encounter>\n)',ehr_visit_strs)
-    langchain_docs = [
-        langchain.schema.Document(page_content=doc) for doc in ehr_visits #broken since ehr_visit_strs is one string of all visits
-    ]
-    # `k` is the number of documents to retrieve
-    # We retrieve everything and just use the BM25Retriever to sort the documents
-    retriever = langchain_community.retrievers.BM25Retriever.from_documents(
-        langchain_docs, k=len(langchain_docs)
-    )
-
-    # Invoking the retriever means the most relevant documents are sorted first
-    sorted_docs = retriever.invoke(query)
-
-    # Define the regex pattern to find the start time
-    # pattern = r'start="([\d/]+ [\d:]+)"'
-    pattern = r'start="([\d/]+ [\d:]+ ?[APM]{0,2})"'
-
-    docs = []
-    dts = []
-
-    # Find the startime of the document
-    for doc in sorted_docs:
-        doc_content = doc.page_content
-        start_dt_match = re.search(pattern, doc_content)
-        if start_dt_match:
-            start_dt = start_dt_match.group(1)
-            parsed = False
-            # Try different date formats
-            for fmt in (
-                "%m/%d/%y %I:%M %p",
-                "%m/%d/%Y %I:%M %p",
-                "%m/%d/%y %H:%M",
-                "%m/%d/%Y %H:%M",
-            ):
-                try:
-                    dts.append(datetime.datetime.strptime(start_dt, fmt))
-                    parsed = True
-                    break
-                except ValueError:
-                    continue
-            if not parsed:
-                print(f"Error parsing date: {start_dt}")
-                continue
-        else:
-            print(f"Start time not found., {doc_content}")
-            dts.append(datetime.datetime.min)
-        docs.append(doc_content)
-
-    final_docs = []
-    current_length = 0
-
-    # Add documents until we exceed the allocated context length
-    for i in range(len(docs)):
-        doc_content = docs[i]
-        doc_length = len(tokenizer.encode(doc_content))
-        final_docs.append((dts[i], doc_content))
-        current_length += doc_length
-        if current_length > target_length:
-            break
-
-    # Sort final_docs chronologically
-    final_docs.sort(key=lambda x: x[0])
-
-    # Extract only the document content for the final output
-    final_docs_content = [doc_content for _, doc_content in final_docs]
-
-    return final_docs_content
-
-
-
 def pack_and_trim_prompts(
     instructions: Dict[int, Dict[str, str]],
     ehrs: Dict[int, str],
-
+    prompt_string: str,
     context_length: int,
     generation_length: int,
     tokenizer: Any,
-    use_RAG: bool = True,
     verbose: bool = False,
     include_ehr: bool = True,
 ) -> Dict[int, str]:
@@ -276,26 +178,15 @@ def pack_and_trim_prompts(
         patient_id = int(instructions[instruction_id]["patient_id"])
         relevant_ehr = ehrs[patient_id]

-        # Calculate how many tokens of EHR we can include in the prompt
         num_tokens_instruction = len(tokenizer.encode(instruction))
-        num_tokens_prompt_template = len(tokenizer.encode(
+        num_tokens_prompt_template = len(tokenizer.encode(prompt_string))
         if include_ehr:
             target_ehr_length = context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
         else:
             target_ehr_length = 0
         if target_ehr_length <= 0:
-            prompt_with_truncated_ehr =
+            prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr="")
         else:
-            if use_RAG:
-                # Return a list of the most relevant visit strings
-                most_relevant_visits = retrieve_most_relevant_visits(
-                    ehr_visit_strs=relevant_ehr,
-                    query=instruction,
-                    target_length=target_ehr_length,
-                    tokenizer=tokenizer,
-                )
-                relevant_ehr = "\n".join(most_relevant_visits)
-
             # Do a first pass with a fast tokenizer
             fast_tokenizer = tiktoken.get_encoding("cl100k_base")
             fast_encoded = fast_tokenizer.encode(relevant_ehr)
@@ -307,13 +198,17 @@
                 encoded_ehr = tokenizer.encode(fast_truncated_ehr)
                 truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
                 truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
-                prompt_with_truncated_ehr =
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
+            else:
+                # If the fast encoding is still too long, just use the full EHR up to allowed length
+                truncated_ehr = fast_tokenizer.decode(fast_encoded[-target_ehr_length:])
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)

-
+        prompts_map[instruction_id] = prompt_with_truncated_ehr

-
-
-
+        if verbose:
+            print(prompt_with_truncated_ehr)
+            print("~" * 20)
     return prompts_map


@@ -322,7 +217,6 @@ def preprocess_prompts(
     generation_length,
     path_to_instructions,
     path_to_ehrs,
-    use_RAG,
     include_ehr,
     tokenizer,
     codes_only=False,
@@ -347,16 +241,18 @@

     # CONSTRUCT & TRUNCATE PROMPTS #
     print("Constructing prompts using instructions and EHRs...")
-    prompt_string=
-
+    prompt_string = (
+        "Instruction: Answer the following question based on the EHR:\n\n"
+        "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
+    )
+
     filled_prompts = pack_and_trim_prompts(
         instructions=instructions,
         ehrs=ehrs,
-
+        prompt_string=prompt_string,
         context_length=target_context_length,
         generation_length=generation_length,
         tokenizer=tokenizer,
-        use_RAG=use_RAG,
         verbose=False,
         include_ehr=include_ehr,
     )
@@ -415,7 +311,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
     path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
     path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
     check_file_exists(path_to_reference_responses, msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'")
-    use_RAG = False
     include_ehr = True
     tokenizer = "tiktoken"

@@ -424,7 +319,6 @@
         generation_length=generation_length,
         path_to_instructions=path_to_instructions,
         path_to_ehrs=path_to_ehrs,
-        use_RAG=use_RAG,
         include_ehr=include_ehr,
         tokenizer=tokenizer,
     )
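The net effect of the helper changes above is that the BM25/langchain retrieval path is removed and the prompt is always built by token-truncating the raw EHR and filling a plain format string. A standalone sketch of that truncation pattern follows; the sample text and token budget are made up, while the template and the cl100k_base encoding come from the diff.

import tiktoken

prompt_string = (
    "Instruction: Answer the following question based on the EHR:\n\n"
    "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
)

fast_tokenizer = tiktoken.get_encoding("cl100k_base")
ehr_text = "visit note: patient reports intermittent chest pain. " * 400  # made-up EHR text
target_ehr_length = 256  # assumed token budget

# Keep only the most recent target_ehr_length tokens of the EHR, as the updated helper does,
# then fill the prompt template with the question and the truncated record.
truncated_ehr = fast_tokenizer.decode(fast_tokenizer.encode(ehr_text)[-target_ehr_length:])
prompt = prompt_string.format(question="What is the patient's chief complaint?", ehr=truncated_ehr)
print(prompt[:200])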
helm/benchmark/scenarios/medbullets_scenario.py
@@ -3,6 +3,7 @@ import csv
 import sys
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded

@@ -143,3 +145,23 @@ class MedBulletsScenario(Scenario):
             csv_path = self.download_csv(output_path, split_suffix)
             instances.extend(self.process_csv(csv_path, split))
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medbullets",
+            display_name="Medbullets",
+            description="Medbullets is a benchmark of USMLE-style medical questions designed to assess "
+            "a model's ability to understand and apply clinical knowledge. Each question is "
+            "accompanied by a patient scenario and five multiple-choice options, similar to "
+            "those found on Step 2 and Step 3 board exams [(MedBullets, "
+            "2025)](https://step2.medbullets.com).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, . Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -1,6 +1,7 @@
 from typing import Dict, List
 from datasets import load_dataset

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )


@@ -125,3 +127,23 @@ class MedCalcBenchScenario(Scenario):
             instances.extend(self.process_csv(data, split))

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medcalc_bench",
+            display_name="MedCalc-Bench",
+            description="MedCalc-Bench is a benchmark designed to evaluate models on their ability to "
+            "compute clinically relevant values from patient notes. Each instance consists "
+            "of a clinical note describing the patient's condition, a diagnostic question "
+            "targeting a specific medical value, and a ground truth response. [(Khandekar "
+            "et al., 2024)](https://arxiv.org/abs/2406.12036).",
+            taxonomy=TaxonomyInfo(
+                task="Computational reasoning",
+                what="Compute a specific medical value from a patient note",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medcalc_bench_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/medec_scenario.py
@@ -1,6 +1,7 @@
 import csv
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded

@@ -123,3 +125,24 @@ class MedecScenario(Scenario):
         instances.extend(self.process_csv(test_csv, TEST_SPLIT))

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medec",
+            display_name="Medec",
+            description="Medec is a benchmark composed of clinical narratives that include either "
+            "correct documentation or medical errors. Each entry includes sentence-level "
+            "identifiers and an associated correction task. The model must review the "
+            "narrative and either identify the erroneous sentence and correct it, or "
+            "confirm that the text is entirely accurate [(Abacha et al., "
+            "2025)](https://arxiv.org/abs/2412.19260).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Detect and correct errors in medical narratives",
+                when="Any",
+                who="Researcher, Clinician",
+                language="English",
+            ),
+            main_metric="medec_error_flag_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/medhallu_scenario.py
@@ -1,6 +1,7 @@
 from typing import List
 from datasets import load_dataset

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
     Input,
+    ScenarioMetadata,
 )


@@ -70,3 +72,24 @@ Answer: {answer}
             )
             instances.append(hallucinated_instance)
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medhallu",
+            display_name="MedHallu",
+            description="MedHallu is a benchmark focused on evaluating factual correctness in "
+            "biomedical question answering. Each instance contains a PubMed-derived "
+            "knowledge snippet, a biomedical question, and a model-generated answer. The "
+            "task is to classify whether the answer is factually correct or contains "
+            "hallucinated (non-grounded) information. This benchmark is designed to assess "
+            "the factual reliability of medical language models.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Verify whether answers to questions from PubMed articles are " "factual or hallucinated",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/medhelm/__init__.py (empty file; no line changes to show)
helm/benchmark/scenarios/medhelm/judges.yaml (new file)
@@ -0,0 +1,14 @@
+# The judges to be used for evaluating the note summary scenario.
+# name: The short name for the judge.
+# model: The field value matching the 'model_name' field under model_deployments.yaml
+# model_deployment: The field value matching the 'name' under model_deployments.yaml.
+judges:
+  - name: "gpt"
+    model: "openai/gpt-4o-2024-05-13"
+    model_deployment: "stanfordhealthcare/gpt-4o-2024-05-13"
+  - name: "llama"
+    model: "meta/llama-3.3-70b-instruct"
+    model_deployment: "stanfordhealthcare/llama-3.3-70b-instruct"
+  - name: "claude"
+    model: "anthropic/claude-3-7-sonnet-20250219"
+    model_deployment: "stanfordhealthcare/claude-3-7-sonnet-20250219"
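A minimal sketch of reading the new judge list follows; the loading code is illustrative and is not the package's own loader, only the file path and field names come from this diff.

import yaml

with open("helm/benchmark/scenarios/medhelm/judges.yaml") as f:
    judges_config = yaml.safe_load(f)

for judge in judges_config["judges"]:
    # Each entry pairs a short judge name with a model and the deployment
    # name declared in model_deployments.yaml.
    print(judge["name"], judge["model"], judge["model_deployment"])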