PyPI - crfm-helm - Versions diffs - 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl - Mend

crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (394) hide show

{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
helm/benchmark/adaptation/adapter_spec.py +10 -0
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
helm/benchmark/annotation/aci_bench_annotator.py +11 -22
helm/benchmark/annotation/air_bench_annotator.py +1 -1
helm/benchmark/annotation/alrage_annotator.py +90 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
helm/benchmark/annotation/dischargeme_annotator.py +11 -22
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +11 -22
helm/benchmark/annotation/medalign_annotator.py +11 -22
helm/benchmark/annotation/medi_qa_annotator.py +11 -22
helm/benchmark/annotation/medication_qa_annotator.py +11 -22
helm/benchmark/annotation/mental_health_annotator.py +11 -22
helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
helm/benchmark/annotation/model_as_judge.py +23 -18
helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
helm/benchmark/metrics/air_bench_metrics.py +3157 -1
helm/benchmark/metrics/alrage_metric.py +35 -0
helm/benchmark/metrics/basic_metrics.py +267 -2
helm/benchmark/metrics/bbq_metrics.py +12 -0
helm/benchmark/metrics/classification_metrics.py +19 -1
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +1 -1
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
helm/benchmark/metrics/copyright_metrics.py +1 -1
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
helm/benchmark/metrics/dry_run_metrics.py +30 -1
helm/benchmark/metrics/efficiency_metrics.py +74 -0
helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
helm/benchmark/metrics/ifeval_metrics.py +13 -1
helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
helm/benchmark/metrics/language_modeling_metrics.py +13 -1
helm/benchmark/metrics/live_qa_metrics.py +13 -1
helm/benchmark/metrics/llm_jury_metrics.py +13 -1
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
helm/benchmark/metrics/medec_metrics.py +25 -2
helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
helm/benchmark/metrics/metric.py +25 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
helm/benchmark/metrics/omni_math_metrics.py +13 -1
helm/benchmark/metrics/safety_metrics.py +13 -1
helm/benchmark/metrics/seahelm_metrics.py +14 -1
helm/benchmark/metrics/summac/model_summac.py +3 -3
helm/benchmark/metrics/summarization_metrics.py +129 -1
helm/benchmark/metrics/toxicity_metrics.py +31 -1
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
helm/benchmark/metrics/wildbench_metrics.py +21 -1
helm/benchmark/model_deployment_registry.py +11 -19
helm/benchmark/presentation/create_plots.py +11 -2
helm/benchmark/presentation/run_display.py +13 -3
helm/benchmark/presentation/run_entry.py +2 -2
helm/benchmark/presentation/schema.py +10 -22
helm/benchmark/presentation/summarize.py +189 -14
helm/benchmark/presentation/taxonomy_info.py +20 -0
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/run.py +15 -4
helm/benchmark/run_expander.py +4 -0
helm/benchmark/run_specs/arabic_run_specs.py +197 -0
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/classic_run_specs.py +2 -55
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +3 -1
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +48 -1
helm/benchmark/run_specs/medhelm/__init__.py +0 -0
helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
helm/benchmark/runner.py +7 -0
helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
helm/benchmark/scenarios/air_bench_scenario.py +21 -0
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/alrage_scenario.py +54 -0
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
helm/benchmark/scenarios/aratrust_scenario.py +95 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
helm/benchmark/scenarios/banking77_scenario.py +21 -0
helm/benchmark/scenarios/bbq_scenario.py +15 -0
helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
helm/benchmark/scenarios/bluex_scenario.py +70 -0
helm/benchmark/scenarios/bold_scenario.py +15 -0
helm/benchmark/scenarios/boolq_scenario.py +20 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
helm/benchmark/scenarios/clear_scenario.py +23 -0
helm/benchmark/scenarios/cleva_scenario.py +480 -1
helm/benchmark/scenarios/code_scenario.py +28 -0
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/commonsense_scenario.py +32 -0
helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
helm/benchmark/scenarios/copyright_scenario.py +35 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
helm/benchmark/scenarios/disinformation_scenario.py +22 -0
helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
helm/benchmark/scenarios/financebench_scenario.py +21 -0
helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
helm/benchmark/scenarios/gpqa_scenario.py +18 -0
helm/benchmark/scenarios/grammar_scenario.py +20 -1
helm/benchmark/scenarios/gsm_scenario.py +21 -0
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
helm/benchmark/scenarios/headqa_scenario.py +22 -0
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
helm/benchmark/scenarios/ice_scenario.py +21 -1
helm/benchmark/scenarios/ifeval_scenario.py +18 -0
helm/benchmark/scenarios/imdb_scenario.py +15 -0
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
helm/benchmark/scenarios/koala_scenario.py +21 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
helm/benchmark/scenarios/legal_support_scenario.py +13 -0
helm/benchmark/scenarios/legalbench_scenario.py +19 -0
helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
helm/benchmark/scenarios/lextreme_scenario.py +11 -0
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
helm/benchmark/scenarios/math_scenario.py +54 -20
helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
helm/benchmark/scenarios/med_qa_scenario.py +20 -0
helm/benchmark/scenarios/medalign_scenario.py +23 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
helm/benchmark/scenarios/medbullets_scenario.py +22 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
helm/benchmark/scenarios/medec_scenario.py +23 -0
helm/benchmark/scenarios/medhallu_scenario.py +23 -0
helm/benchmark/scenarios/medhelm/__init__.py +0 -0
helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
helm/benchmark/scenarios/melt_scenarios.py +2 -2
helm/benchmark/scenarios/mental_health_scenario.py +23 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
helm/benchmark/scenarios/mmlu_scenario.py +21 -0
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/msmarco_scenario.py +30 -0
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
helm/benchmark/scenarios/omni_math_scenario.py +18 -0
helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
helm/benchmark/scenarios/quac_scenario.py +14 -0
helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
helm/benchmark/scenarios/raft_scenario.py +15 -0
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
helm/benchmark/scenarios/scenario.py +31 -0
helm/benchmark/scenarios/seahelm_scenario.py +350 -2
helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
helm/benchmark/scenarios/situation_prompts.yaml +49 -0
helm/benchmark/scenarios/spider_scenario.py +18 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
helm/benchmark/scenarios/summarization_scenario.py +37 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
helm/benchmark/scenarios/the_pile_scenario.py +13 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
helm/benchmark/scenarios/vicuna_scenario.py +21 -1
helm/benchmark/scenarios/wikifact_scenario.py +20 -0
helm/benchmark/scenarios/wildbench_scenario.py +18 -0
helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +8 -1
helm/benchmark/static/schema_arabic.yaml +271 -0
helm/benchmark/static/schema_classic.yaml +0 -17
helm/benchmark/static/schema_long_context.yaml +17 -18
helm/benchmark/static/schema_medhelm.yaml +36 -0
helm/benchmark/static/schema_slp.yaml +219 -0
helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
helm/benchmark/static_build/index.html +5 -6
helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
helm/clients/ai21_client.py +2 -0
helm/clients/aleph_alpha_client.py +2 -0
helm/clients/anthropic_client.py +7 -1
helm/clients/audio_language/diva_llama_client.py +2 -0
helm/clients/audio_language/llama_omni/arguments.py +61 -0
helm/clients/audio_language/llama_omni/constants.py +9 -0
helm/clients/audio_language/llama_omni/conversation.py +213 -0
helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
helm/clients/audio_language/llama_omni/model/builder.py +88 -0
helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
helm/clients/audio_language/llama_omni/preprocess.py +295 -0
helm/clients/audio_language/llama_omni/utils.py +202 -0
helm/clients/audio_language/llama_omni_client.py +2 -1
helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
helm/clients/audio_language/qwen_audiolm_client.py +2 -1
helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
helm/clients/bedrock_client.py +63 -6
helm/clients/cohere_client.py +3 -0
helm/clients/dspy_client.py +135 -0
helm/clients/google_client.py +2 -0
helm/clients/http_model_client.py +2 -0
helm/clients/huggingface_client.py +4 -3
helm/clients/ibm_client.py +3 -1
helm/clients/image_generation/adobe_vision_client.py +2 -0
helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
helm/clients/image_generation/cogview2_client.py +2 -1
helm/clients/image_generation/dalle2_client.py +2 -0
helm/clients/image_generation/dalle_mini_client.py +2 -1
helm/clients/image_generation/deep_floyd_client.py +2 -0
helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
helm/clients/image_generation/lexica_client.py +2 -0
helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
helm/clients/image_generation/mindalle_client.py +2 -1
helm/clients/image_generation/together_image_generation_client.py +2 -0
helm/clients/megatron_client.py +2 -0
helm/clients/mistral_client.py +2 -0
helm/clients/moderation_api_client.py +2 -0
helm/clients/openai_client.py +38 -21
helm/clients/openai_responses_client.py +34 -8
helm/clients/openrouter_client.py +31 -0
helm/clients/palmyra_client.py +2 -1
helm/clients/reka_client.py +2 -1
helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
helm/clients/stanfordhealthcare_http_model_client.py +2 -0
helm/clients/test_huggingface_client.py +3 -3
helm/clients/test_openrouter_client.py +69 -0
helm/clients/together_client.py +52 -13
helm/clients/vertexai_client.py +23 -11
helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
helm/clients/vision_language/huggingface_vlm_client.py +2 -0
helm/clients/vision_language/idefics_client.py +2 -1
helm/clients/vision_language/open_flamingo_client.py +2 -1
helm/clients/vision_language/paligemma_client.py +2 -1
helm/clients/vision_language/palmyra_vision_client.py +2 -0
helm/clients/vision_language/qwen2_vlm_client.py +2 -1
helm/clients/vision_language/qwen_vlm_client.py +2 -1
helm/clients/vllm_client.py +43 -7
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/clients/writer_client.py +5 -2
helm/common/critique_request.py +0 -1
helm/common/hierarchical_logger.py +103 -34
helm/common/object_spec.py +23 -8
helm/common/optional_dependencies.py +1 -1
helm/common/test_general.py +4 -0
helm/common/test_logging.py +94 -0
helm/config/model_deployments.yaml +1001 -187
helm/config/model_metadata.yaml +602 -18
helm/config/tokenizer_configs.yaml +202 -5
helm/proxy/cli.py +1 -1
helm/proxy/example_queries.py +8 -8
helm/proxy/retry.py +5 -0
helm/proxy/server.py +2 -1
helm/proxy/static/index.css +4 -0
helm/proxy/static/index.js +7 -1
helm/tokenizers/auto_tokenizer.py +2 -2
helm/tokenizers/grok_tokenizer.py +2 -0
helm/benchmark/metrics/aci_bench_metrics.py +0 -14
helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
helm/benchmark/metrics/dischargeme_metrics.py +0 -14
helm/benchmark/metrics/med_dialog_metrics.py +0 -14
helm/benchmark/metrics/medalign_metrics.py +0 -14
helm/benchmark/metrics/medi_qa_metrics.py +0 -14
helm/benchmark/metrics/medication_qa_metrics.py +0 -14
helm/benchmark/metrics/mental_health_metrics.py +0 -14
helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
helm/benchmark/scenarios/numeracy_scenario.py +0 -794
helm/benchmark/static_build/assets/index-94295e78.js +0 -10
helm/benchmark/static_build/assets/index-b9779128.css +0 -1
helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
/helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
/helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
/helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
/helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
/helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
/helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
/helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
/helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
/helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
/helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
/helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
/helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
/helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/medhelm_configurable_scenario.py ADDED Viewed

@@ -0,0 +1,101 @@
+import string
+import json
+import pandas as pd
+from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.run_specs.medhelm.benchmark_config import get_benchmark_config_from_path
+from helm.common.general import check_file_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    CORRECT_TAG,
+    Reference,
+    Input,
+    Output,
+    TEST_SPLIT,
+    ScenarioMetadata,
+)
+class MedHELMConfigurableScenario(Scenario):
+    """
+    MedHELM configuratble scenario
+    """
+    tags = ["biomedical"]
+    def __init__(self, name: str, config_path: str):
+        super().__init__()
+        self.benchmark_config = get_benchmark_config_from_path(config_path)
+        self.name = name
+        self.description = self.benchmark_config.description
+    def get_columns_in_template(self, template: str) -> List[str]:
+        """
+        Extract field names from a template string using Python's Formatter.
+        Example: "Name: {name}, Age: {age}" → ["name", "age"]
+        """
+        formatter = string.Formatter()
+        return [fname for _, fname, _, _ in formatter.parse(template) if fname]
+    def populate_template(self, template: str, row: pd.Series, fields: List[str]) -> str:
+        """
+        Populate the template with values from the row using format_map.
+        Missing fields default to empty string.
+        """
+        mapping = {field: row.get(field, "") for field in fields}
+        return template.format_map(mapping)
+    def get_references(self, row: pd.Series) -> List[Reference]:
+        references: List[Reference] = []
+        if "correct_answer" in row:
+            references.append(Reference(Output(text=row["correct_answer"]), tags=[CORRECT_TAG]))
+        if "incorrect_answers" in row:
+            for incorrect_answer in row["incorrect_answers"]:
+                references.append(Reference(Output(text=incorrect_answer), tags=[]))
+        return references
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.benchmark_config.prompt_file, msg=f"Prompt file for {self.name} does not exist")
+        check_file_exists(self.benchmark_config.dataset_file, msg=f"Dataset file for {self.name} does not exist")
+        instances: List[Instance] = []
+        df = pd.read_csv(self.benchmark_config.dataset_file)
+        if "correct_answer" not in df.columns:
+            if not self._is_llm_as_judge() or len(self.benchmark_config.metrics) > 1:
+                raise ValueError(
+                    "Dataset must contain 'correct_answer' column unless using jury_score as the only metric."
+                )
+        if "incorrect_answers" in df.columns:
+            df["incorrect_answers"] = df["incorrect_answers"].apply(json.loads)
+        with open(self.benchmark_config.prompt_file, "r") as f:
+            template = f.read()
+        fields = self.get_columns_in_template(template)
+        for _, row in df.iterrows():
+            filled = self.populate_template(template, row, fields)
+            prompt = Input(text=filled)
+            instances.append(Instance(input=prompt, references=self.get_references(row), split=TEST_SPLIT))
+        return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name=self.name,
+            display_name=self.name,
+            description=self.description,
+            taxonomy=TaxonomyInfo(
+                task="",
+                what="",
+                when="",
+                who="",
+                language="",
+            ),
+            main_metric=self.benchmark_config.main_metric.name,
+            main_split="test",
+        )
+    def _is_llm_as_judge(self) -> bool:
+        for metric in self.benchmark_config.metrics:
+            if metric.name == "jury_score":
+                return True
+        return False

helm/benchmark/scenarios/medi_qa_scenario.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from typing import Dict, List
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
@@ -49,7 +51,7 @@ class MediQAScenario(Scenario):
     name = "medi_qa"
     description = (
-        "MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate"
+        "MEDIQA is a benchmark designed to evaluate a model's ability to generate"
         "medically accurate answers to patient-generated questions. Each instance includes a"
         "consumer health question, a set of candidate answers (used in ranking tasks), relevance"
         "annotations, and optionally, additional context. The benchmark focuses on supporting"
@@ -109,3 +111,24 @@ class MediQAScenario(Scenario):
             instances.extend(self.process_csv(data, split))
         return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medi_qa",
+            display_name="MEDIQA",
+            description="MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and "
+            "generate medically accurate answers to patient-generated questions. Each "
+            "instance includes a consumer health question, a set of candidate answers (used "
+            "in ranking tasks), relevance annotations, and optionally, additional context. "
+            "The benchmark focuses on supporting patient understanding and accessibility in "
+            "health communication.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate medically accurate answers to patient-generated questions.",
+                when="Any",
+                who="Clinician, Medical Student",
+                language="English",
+            ),
+            main_metric="medi_qa_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medication_qa_scenario.py CHANGED Viewed

@@ -3,9 +3,19 @@ from typing import List
 import pandas as pd
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+    ScenarioMetadata,
+)
 class MedicationQAScenario(Scenario):
@@ -64,3 +74,23 @@ class MedicationQAScenario(Scenario):
         ]
         return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medication_qa",
+            display_name="MedicationQA",
+            description="MedicationQA is a benchmark composed of open-ended consumer health questions "
+            "specifically focused on medications. Each example consists of a free-form "
+            "question and a corresponding medically grounded answer. The benchmark "
+            "evaluates a model's ability to provide accurate, accessible, and informative "
+            "medication-related responses for a lay audience.",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer consumer medication-related questions",
+                when="Any",
+                who="Patient, Pharmacist",
+                language="English",
+            ),
+            main_metric="medication_qa_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/melt_scenarios.py CHANGED Viewed

@@ -439,13 +439,13 @@ class MELTMATHScenario(Scenario):
         for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
             if split == TRAIN_SPLIT and self.use_official_examples:
                 train_instances = [
-                    ("Kết quả của $\left(\\frac{7}{8}\\right)^3 \cdot \left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
+                    ("Kết quả của $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
                     (
                         "Có bao nhiêu cách chọn 4 quyển sách từ một kệ sách có 6 quyển,"
                         + " nếu thứ tự các cuốn sách được chọn không quan trọng?",
                         "15",
                     ),
-                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\sqrt{59}"),
+                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\\sqrt{59}"),
                     (
                         "Các mặt của khối xúc xắc bát diện được dán nhãn bằng các số từ $1$ đến $8$."
                         + " Xác suất tung một cặp xúc xắc bát diện để được tổng số bằng $15$ là bao nhiêu?"

helm/benchmark/scenarios/mental_health_scenario.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import pandas as pd
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
@@ -121,3 +123,24 @@ class MentalHealthScenario(Scenario):
         instances = self.process_dialogue_data(dialogue_data)
         return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mental_health",
+            display_name="MentalHealth",
+            description="MentalHealth is a benchmark focused on evaluating empathetic communication in "
+            "mental health counseling. It includes real or simulated conversations between "
+            "patients and counselors, where the task is to generate compassionate and "
+            "appropriate counselor responses. The benchmark assesses a model's ability to "
+            "support patients emotionally and meaningfully engage in therapeutic "
+            "conversations.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate empathetic counseling responses in mental health " "conversations",
+                when="Any",
+                who="Counselors, Patients",
+                language="English",
+            ),
+            main_metric="mental_health_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimic_bhc_scenario.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import json
 from typing import Dict, List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,11 +11,12 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 class MIMICBHCScenario(Scenario):
-    """
+    r"""
     MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
     course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
@@ -101,3 +103,25 @@ class MIMICBHCScenario(Scenario):
                 )
         return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimic_bhc",
+            display_name="MIMIC-IV-BHC",
+            short_display_name="MIMIC-BHC",
+            description="MIMIC-BHC is a benchmark focused on summarization of discharge notes into "
+            "Brief Hospital Course (BHC) sections. It consists of curated discharge notes "
+            "from MIMIC-IV, each paired with its corresponding BHC summary. The benchmark "
+            "evaluates a model's ability to condense detailed clinical information into "
+            "accurate, concise summaries that reflect the patient's hospital stay [(Aali et "
+            "al., 2024)](https://doi.org/10.1093/jamia/ocae312).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Summarize the clinical note into a brief hospital course",
+                when="Upon hospital discharge",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="mimic_bhc_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimic_rrs_scenario.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 from typing import Dict, List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
@@ -96,3 +98,24 @@ class MIMICRRSScenario(Scenario):
             lines = file.readlines()
         lines = [line.strip() for line in lines]
         return lines
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimic_rrs",
+            display_name="MIMIC-RRS",
+            short_display_name="MIMIC-RRS",
+            description="MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III "
+            "database. It contains pairs of ‘Findings‘ and ‘Impression‘ sections, enabling "
+            "evaluation of a model's ability to summarize diagnostic imaging observations "
+            "into concise, clinically relevant conclusions [(Chen et al., "
+            "2023)](https://arxiv.org/abs/2211.08584).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate radiology report summaries from findings sections",
+                when="Post-imaging",
+                who="Radiologist",
+                language="English",
+            ),
+            main_metric="mimic_rrs_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimiciv_billing_code_scenario.py CHANGED Viewed

@@ -2,6 +2,7 @@ import pandas as pd
 import numpy as np
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
@@ -75,3 +77,23 @@ class MIMICIVBillingCodeScenario(Scenario):
                 continue
         return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimiciv_billing_code",
+            display_name="MIMIC-IV Billing Code",
+            description="MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the "
+            "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The "
+            "task requires models to extract structured billing codes based on free-text "
+            "clinical notes, reflecting real-world hospital coding tasks for financial "
+            "reimbursement.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict ICD-10 billing codes from clinical discharge notes",
+                when="During or after patient discharge",
+                who="Hospital Admistrator",
+                language="English",
+            ),
+            main_metric="mimiciv_billing_code_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/mmlu_pro_scenario.py CHANGED Viewed

@@ -2,6 +2,7 @@ from typing import Dict, List
 from datasets import Dataset, load_dataset
 from helm.common.hierarchical_logger import hlog
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
@@ -93,3 +95,19 @@ class MMLUProScenario(Scenario):
             instances.extend(self.process_dataset(data, split))
         return instances
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="MMLU-Pro",
+            description=self.description,
+            main_metric="chain_of_thought_correctness",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="graduate-level questions in biology, physics, and chemistry",
+                who="domain experts",
+                when="2023",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/mmlu_scenario.py CHANGED Viewed

@@ -2,6 +2,7 @@ import csv
 import os
 from typing import Dict, List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
@@ -124,3 +126,22 @@ class MMLUScenario(Scenario):
             instances.extend(self.process_csv(csv_path, splits[split]))
         return instances
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="mmlu",
+            display_name="MMLU (Massive Multitask Language Understanding)",
+            short_display_name="MMLU",
+            description="The Massive Multitask Language Understanding (MMLU) benchmark for "
+            "knowledge-intensive question answering across 57 domains [(Hendrycks et al., "
+            "2021)](https://arxiv.org/pdf/2009.03300.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="math, science, history, etc.",
+                when="before 2021",
+                who="various online sources",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/mmmlu_scenario.py ADDED Viewed

@@ -0,0 +1,85 @@
+import os
+from typing import List
+import datasets
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+class MMMLUScenario(Scenario):
+    """Multilingual Massive Multitask Language Understanding (MMMLU) by OpenAI
+    The MMLU is a widely recognized benchmark of general knowledge attained
+    by AI models. It covers a broad range of topics from 57 different categories,
+    covering elementary-level knowledge up to advanced professional subjects like
+    law, physics, history, and computer science.
+    MMMLU is a translation of MMLU’s test set into 14 languages using professional
+    human translators. Relying on human translators for this evaluation increases
+    confidence in the accuracy of the translations, especially for low-resource
+    languages like Yoruba.
+    The Massive Multitask Language Understanding benchmark from this paper:
+    - https://arxiv.org/pdf/2009.03300.pdf
+    The MMMLU dataset is from here:
+    - https://huggingface.co/datasets/openai/MMMLU
+    """
+    name = "mmmlu"
+    description = "Multilingual Massive Multitask Language Understanding"
+    tags = ["knowledge", "multiple_choice"]
+    OPTIONS = ["A", "B", "C", "D"]
+    def __init__(self, locale: str, subject: str):
+        super().__init__()
+        self.locale: str = locale
+        self.subject: str = subject
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/MMMLU",
+            self.locale,
+            revision="325a01dc3e173cac1578df94120499aaca2e2504",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            if self.subject != "all" and row["Subject"] != self.subject:
+                continue
+            input = Input(text=row["Question"])
+            references: List[Reference] = []
+            for option in self.OPTIONS:
+                references.append(
+                    Reference(
+                        output=Output(text=row[option]),
+                        tags=[CORRECT_TAG] if option == row["Answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+        return instances

helm/benchmark/scenarios/msmarco_scenario.py CHANGED Viewed

@@ -4,6 +4,7 @@ import random
 from collections import defaultdict
 from typing import Dict, List, Tuple, Optional, Union
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     VALID_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
     make_rank_tag,
     make_relevance_tag,
     Input,
@@ -657,3 +659,31 @@ class MSMARCOScenario(Scenario):
         valid_instances = self.get_valid_instances()
         return train_instances + valid_instances
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.track == self.REGULAR_TRACK:
+            return ScenarioMetadata(
+                name="msmarco_regular",
+                display_name="MS MARCO (regular track)",
+                short_display_name="MS MARCO (regular)",
+                description="The MS MARCO benchmark's regular track for passage retrieval in information "
+                "retrieval "
+                "[(https://microsoft.github.io/msmarco/)](https://microsoft.github.io/msmarco/).",
+                taxonomy=TaxonomyInfo(task="information retrieval", what="?", when="?", who="?", language="English"),
+                main_metric="RR@10",
+                main_split="valid",
+            )
+        elif self.track == self.TREC_TRACK:
+            return ScenarioMetadata(
+                name="msmarco_trec",
+                display_name="MS MARCO (TREC track)",
+                short_display_name="MS MARCO (TREC)",
+                description="The MS MARCO benchmark's deep learning TREC track for passage retrieval in "
+                "information retrieval "
+                "[(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).",
+                taxonomy=TaxonomyInfo(task="information retrieval", what="?", when="?", who="?", language="English"),
+                main_metric="NDCG@10",
+                main_split="valid",
+            )
+        else:
+            raise Exception(f"Unknown track {self.track}")

helm/benchmark/scenarios/mtsamples_procedures_scenario.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import requests
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
@@ -142,3 +144,23 @@ class MTSamplesProceduresScenario(Scenario):
                 print(f"Error processing {file_name}: {e}")
         return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mtsamples_procedures",
+            display_name="MTSamples Procedures",
+            description="MTSamples Procedures is a benchmark composed of transcribed operative notes, "
+            "focused on documenting surgical procedures. Each example presents a brief "
+            "patient case involving a surgical intervention, and the model is tasked with "
+            "generating a coherent and clinically accurate procedural summary or treatment "
+            "plan.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Document and extract information about medical procedures",
+                when="Post-procedure",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="mtsamples_procedures_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mtsamples_replicate_scenario.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import requests
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
@@ -140,3 +142,23 @@ class MTSamplesReplicateScenario(Scenario):
                 print(f"Error processing {file_name}: {e}")
         return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mtsamples_replicate",
+            display_name="MTSamples",
+            short_display_name="MTSamples",
+            description="MTSamples Replicate is a benchmark that provides transcribed medical reports "
+            "from various specialties. It is used to evaluate a model's ability to generate "
+            "clinically appropriate treatment plans based on unstructured patient "
+            "documentation [(MTSamples, 2025)](https://mtsamples.com).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate treatment plans based on clinical notes",
+                when="Post-diagnosis",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="mtsamples_replicate_accuracy",
+            main_split="test",
+        )

crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl