crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/kpi_edgar_metrics.py

@@ -6,6 +6,7 @@ import numpy as np
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.kpi_edgar_scenario import KPIEDGARScenario
@@ -119,3 +120,23 @@ class KPIEdgarMetric(EvaluateInstancesMetric):
 
     def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
         return _compute_stats(_request_states_to_pred_gold_pairs(request_states))
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="word_macro_f1_score",
+                display_name="Word F1 Score",
+                description="Word F1 Score",
+                lower_is_better=None,
+                group=None,
+            ),
+            MetricMetadata(
+                name="adjusted_macro_f1_score",
+                display_name="Adjusted Macro F1 Score",
+                short_display_name="Adjusted Macro F1 Score",
+                description="Entity type classification F1 score, adjusted for partial matches following the KPI-Edgar "
+                "paper, macro-averaged across entity types",
+                lower_is_better=None,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/language_modeling_metrics.py

@@ -11,7 +11,7 @@ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from helm.benchmark.metrics.metric import MetricInterface, MetricMetadata, MetricResult, PerInstanceStats, add_context
 from helm.benchmark.metrics.metric_name import MetricContext, MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -97,3 +97,15 @@ class LanguageModelingMetric(MetricInterface):
         derived_stats: List[Stat] = []
         derived_stats.extend(compute_perplexity_metrics(stats_dict))
         return derived_stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="num_instances",
+                display_name="# eval",
+                short_display_name=None,
+                description="Number of evaluation instances.",
+                lower_is_better=None,
+                group="general_information",
+            ),
+        ]

helm/benchmark/metrics/live_qa_metrics.py

@@ -2,7 +2,7 @@ from typing import List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -21,3 +21,15 @@ class LiveQAScoreMetric(Metric):
         assert request_state.annotations
         score = request_state.annotations["live_qa"]["score"]
         return [Stat(MetricName("live_qa_score")).add(score)]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="live_qa_score",
+                display_name="Judge Score",
+                short_display_name=None,
+                description="LLM-as-judge score",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/llm_jury_metrics.py

@@ -3,7 +3,7 @@ from typing import Any, Dict, List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -44,3 +44,15 @@ class LLMJuryMetric(Metric):
         return [
             Stat(MetricName(self.metric_name)).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name=self.metric_name,
+                display_name="Jury Score",
+                short_display_name="Jury Score",
+                description="Measures the average score assigned by an LLM-based jury evaluating task performance.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/lmkt_metric_specs.py

@@ -0,0 +1,12 @@
+from typing import List
+
+from helm.benchmark.metrics.metric import MetricSpec
+
+
+def get_semantic_similarity_metric_specs(similarity_fn_name: str = "cosine") -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.lmkt_metrics.SemanticSimilarityMetric",
+            args={"similarity_fn_name": similarity_fn_name},
+        ),
+    ]
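
Aside: the factory above simply wraps the new SemanticSimilarityMetric (defined in the next file) in a MetricSpec. A minimal usage sketch, not taken from this diff; the surrounding run-spec wiring is illustrative only:

    from helm.benchmark.metrics.lmkt_metric_specs import get_semantic_similarity_metric_specs

    # Returns a one-element list; the metric class is resolved later from its dotted class_name.
    metric_specs = get_semantic_similarity_metric_specs(similarity_fn_name="cosine")
    print(metric_specs[0].class_name)
    # helm.benchmark.metrics.lmkt_metrics.SemanticSimilarityMetric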

helm/benchmark/metrics/lmkt_metrics.py

@@ -0,0 +1,47 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+from sentence_transformers import SentenceTransformer
+
+
+class SemanticSimilarityMetric(Metric):
+    """Score metrics for LMKT semantic similarity measurement."""
+
+    def __init__(self, similarity_fn_name: str = "cosine"):
+        """
+        Initialize the SemanticSimilarityMetric with a SentenceTransformer model.
+        :param similarity_fn_name: The name of the similarity function to use.
+            Available options are "dot", "cosine", "manhattan" and "euclidean".
+        """
+        super().__init__()
+
+        self.model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", similarity_fn_name=similarity_fn_name)
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+
+        assert request_state.result
+
+        completions = [c.text for c in request_state.result.completions]
+        completion_embeddings = self.model.encode(completions)
+
+        references = [r.output.text for r in request_state.instance.references if CORRECT_TAG in r.tags]
+        reference_embeddings = self.model.encode(references)
+
+        similarities = self.model.similarity(completion_embeddings, reference_embeddings)
+        avg_similarity = similarities.mean().item()
+
+        return [
+            Stat(MetricName("semantic_similarity")).add(avg_similarity),
+        ]
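
Aside: evaluate_generation above averages a completions-by-references similarity matrix into a single score. A standalone sketch of the same computation, assuming a sentence-transformers release (>= 3.0) that provides the similarity() helper used here:

    from sentence_transformers import SentenceTransformer

    # Mirrors SemanticSimilarityMetric: embed both sides, score all pairs, average.
    model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", similarity_fn_name="cosine")
    completions = ["The cat sits on the mat."]
    references = ["A cat is sitting on a mat.", "Un chat est assis sur un tapis."]
    scores = model.similarity(model.encode(completions), model.encode(references))  # shape (1, 2)
    print(scores.mean().item())  # the value recorded in the "semantic_similarity" Stat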

helm/benchmark/metrics/medcalc_bench_metrics.py

@@ -4,7 +4,7 @@ from datetime import datetime
 from typing import List, Dict, Any
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -122,3 +122,16 @@ class MedCalcBenchMetric(Metric):
         return [
             Stat(MetricName("medcalc_bench_accuracy")).add(exact_match),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="medcalc_bench_accuracy",
+                display_name="MedCalc Accuracy",
+                short_display_name="MedCalc Accuracy",
+                description="Comparison based on category. Exact match for categories risk, severity and diagnosis. "
+                "Check if within range for the other categories.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/medec_metrics.py

@@ -1,12 +1,13 @@
 from typing import List
+import re
+
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 from helm.common.hierarchical_logger import hlog
-import re
 from helm.benchmark.scenarios.scenario import CORRECT_TAG
 
 
@@ -99,3 +100,25 @@ class MedecMetric(Metric):
             Stat(MetricName("medec_error_flag_accuracy")).add(flag_accuracy),
             Stat(MetricName("medec_error_sentence_accuracy")).add(sentence_accuracy),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="medec_error_flag_accuracy",
+                display_name="Medical Error Flag Accuracy",
+                short_display_name="MedecFlagAcc",
+                description="Measures how accurately the model identifies whether a clinical note contains an error "
+                "(binary classification of correct/incorrect).",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="medec_error_sentence_accuracy",
+                display_name="Medical Error Sentence Accuracy",
+                short_display_name="MedecSentenceAcc",
+                description="Measures how accurately the model identifies the specific erroneous sentence within a "
+                "clinical note.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/melt_toxicity_metric.py

@@ -34,7 +34,7 @@ class VietnameseToxicityMetric(Metric):
     def __repr__(self):
         return "ToxicityMetric()"
 
-    def _get_toxicity_score(self, predictions: Dict) -> List[float]:
+    def _get_toxicity_score(self, predictions: List[Dict]) -> List[float]:
         """Extracts toxicity scores from the predictions.
 
         Args:

helm/benchmark/metrics/metric.py

@@ -86,6 +86,28 @@ class Processor:
         return instance_stats
 
 
+@dataclass(frozen=True)
+class MetricMetadata:
+    name: str
+    """Internal name (usually no spaces, etc.)"""
+
+    display_name: Optional[str] = None
+    """What is displayed to the user"""
+
+    short_display_name: Optional[str] = None
+    """What is displayed to the user (e.g., in a table header)"""
+
+    description: Optional[str] = None
+    """Description of the metric"""
+
+    lower_is_better: Optional[bool] = None
+    """Whether a lower value for this metric corresponds to a better model
+    (e.g., False for accuracy, True for perplexity, None for num_trials)"""
+
+    group: Optional[str] = None
+    """Name of the default metric group for this metric"""
+
+
 class MetricInterface(ABC):
     """Interface for all Metrics."""
 
@@ -95,6 +117,9 @@ class MetricInterface(ABC):
     ) -> MetricResult:
         pass
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        raise NotImplementedError()
+
 
 class Metric(MetricInterface, ABC):
     """

helm/benchmark/metrics/mimiciv_billing_code_metrics.py

@@ -1,7 +1,7 @@
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -94,3 +94,34 @@ class MIMICIVBillingCodeMetric(Metric):
             Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
             Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="mimiciv_billing_code_precision",
+                display_name="Precision for MIMIC Billing Codes",
+                short_display_name="MIMICBillingPre",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by "
+                "the model.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_recall",
+                display_name="Recall for MIMIC Billing Codes",
+                short_display_name="MIMICBillingRec",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes present in "
+                "the gold standard.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_f1",
+                display_name="F1 Score for MIMIC Billing Codes",
+                short_display_name="MIMICBillingF1",
+                description="Measures the harmonic mean of precision and recall for ICD codes, providing a balanced "
+                "evaluation of the model's performance.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/omni_math_metrics.py

@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -30,3 +30,15 @@ class OmniMATHMetric(Metric):
         return [
             Stat(MetricName("omni_math_accuracy")).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="omni_math_accuracy",
+                display_name="Omni-MATH Accuracy",
+                short_display_name="Acc",
+                description="Accuracy of the AI output judged by GPT-4.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]

helm/benchmark/metrics/safety_metrics.py

@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
 from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -77,3 +77,15 @@ class SafetyScoreMetric(Metric):
             raise SafetyScoreMetricException("Could not compute safety score because all annotators failed.")
         stats.append(safety_score_stat)
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="safety_score",
+                display_name="LM Evaluated Safety score",
+                description="LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the "
+                "output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]

helm/benchmark/metrics/seahelm_metrics.py

@@ -8,7 +8,7 @@ from sacrebleu.metrics import CHRF
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -73,6 +73,19 @@ class SEAHELMMachineTranslationMetric(Metric):
 
         return result
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="chr_f_plus_plus",
+                display_name="ChrF++",
+                description="Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, "
+                "2015)](https://aclanthology.org/W15-3049/). Code can be found "
+                "[here](https://github.com/mjpost/sacrebleu).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
+
 
 class SEAHELMQAMetric(Metric):
     """SEAHELM QA Metrics

helm/benchmark/metrics/summac/model_summac.py

@@ -219,7 +219,7 @@ class SummaCConv(torch.nn.Module):
         imager_load_cache=True,
         agg="mean",
         norm_histo=False,
-        **kwargs
+        **kwargs,
     ):
         # `bins` should be `even%d` or `percentiles`
         assert nli_labels in ["e", "c", "n", "ec", "en", "cn", "ecn"], "Unrecognized nli_labels argument %s" % (
@@ -240,7 +240,7 @@ class SummaCConv(torch.nn.Module):
 
         if "even" in bins:
             n_bins = int(bins.replace("even", ""))
-            self.bins = list(np.arange(0, 1, 1 / n_bins)) + [1.0]
+            self.bins = np.arange(0, 1, 1 / n_bins).tolist() + [1.0]
         elif bins == "percentile":
             self.bins = [
                 0.0,
@@ -405,7 +405,7 @@ class SummaCZS:
         use_con=True,
         imager_load_cache=True,
         device="cuda",
-        **kwargs
+        **kwargs,
     ):
        assert op2 in ["min", "mean", "max"], "Unrecognized `op2`"
        assert op1 in ["max", "mean", "min"], "Unrecognized `op1`"

helm/benchmark/metrics/summarization_metrics.py

@@ -16,7 +16,7 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric import Metric, MetricMetadata, MetricResult
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -240,6 +240,134 @@ class SummarizationMetric(Metric):
 
         return result
 
+    def get_metadata(self):
+        metadata: List[MetricMetadata] = [
+            MetricMetadata(
+                name="QAFactEval",
+                display_name="QAFactEval",
+                description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_coverage",
+                display_name="Coverage",
+                description="Extent to which the model-generated summaries are extractive fragments from the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_density",
+                display_name="Density",
+                description="Extent to which the model-generated summaries are extractive summaries based on the "
+                "source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_compression",
+                display_name="Compression",
+                description="Extent to which the model-generated summaries are compressed relative to the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge_1",
+                display_name="ROUGE-1",
+                short_display_name="ROUGE-1",
+                description="ROUGE-1",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-2",
+                display_name="ROUGE-2",
+                short_display_name="ROUGE-2",
+                description="ROUGE-2",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-l",
+                display_name="ROUGE-L",
+                short_display_name="ROUGE-L",
+                description="ROUGE-L",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+        ]
+        if self.humaneval is not None:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="HumanEval-faithfulness",
+                        display_name="HumanEval-faithfulness",
+                        description="Human evaluation score for faithfulness.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-relevance",
+                        display_name="HumanEval-relevance",
+                        description="Human evaluation score for relevance.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-coherence",
+                        display_name="HumanEval-coherence",
+                        description="Human evaluation score for coherence.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        if self.compute_faithfulness:
+            metadata.append(
+                MetricMetadata(
+                    name="summac",
+                    display_name="SummaC",
+                    description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                    "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                    lower_is_better=False,
+                    group="summarization_metrics",
+                )
+            )
+        if self.compute_bertscore:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="BERTScore-P",
+                        display_name="BERTScore (P)",
+                        description="Average BERTScore precision [(Zhang et al., "
+                        "2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference "
+                        "summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-R",
+                        display_name="BERTScore (R)",
+                        description="Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-F",
+                        display_name="BERTScore (F1)",
+                        description="Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        return metadata
+
 
 def _paired_bootstrap_test(treatment_list: list, control_list: list, nboot: int = 10000):
     """

helm/benchmark/metrics/toxicity_metrics.py

@@ -6,7 +6,7 @@ from helm.common.hierarchical_logger import hlog, hwarn
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -89,3 +89,33 @@ class ToxicityMetric(Metric):
         ]
 
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="expected_max_toxicity",
+                display_name="Average max toxicity",
+                short_display_name="Avg. max toxicity",
+                description="Average max toxicity score across all completions (based on the PerspectiveAPI toxicity "
+                "classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="max_toxicity_probability",
+                display_name="max toxicity",
+                description="Fraction of instances where most toxic completion is above 0.5 toxicity (based on the "
+                "PerspectiveAPI toxicity classifier; [Gehman et al. "
+                "(2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="toxic_frac",
+                display_name="Toxic fraction",
+                description="Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity "
+                "classifier).",
+                lower_is_better=True,
+                group="toxicity",
+            ),
+        ]