crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py ADDED
@@ -0,0 +1,52 @@
+from typing import List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+)
+from sklearn.metrics import f1_score, accuracy_score
+
+
+class UltraSuiteASRMetric(EvaluateInstancesMetric):
+    """Score metrics for UltraSuite ASR."""
+
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        y_pred: List[str] = []
+        y_pred_quasi: List[str] = []
+        y_true: List[str] = []
+        for request_state in request_states:  # one request state per instance
+
+            for reference in request_state.instance.references:
+                if reference.tags == [CORRECT_TAG]:
+                    true_label = reference.output.text
+                    break
+
+            assert request_state.result
+            model_output_text = request_state.result.completions[0].text.strip().lower()
+            assert request_state.instance.extra_data
+            ground_truth_text = request_state.instance.extra_data["transcription"].strip().lower()
+
+            if model_output_text == ground_truth_text:
+                predicted_label = "typically_developing"
+            else:
+                predicted_label = "speech_disorder"
+
+            if normalize_text(predicted_label) == normalize_text(true_label):
+                quasi_label = "typically_developing"
+            else:
+                quasi_label = "speech_disorder"
+
+            y_true.append(true_label)
+            y_pred.append(predicted_label)
+            y_pred_quasi.append(quasi_label)
+
+        return [
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+            Stat(MetricName("exact_match")).add(accuracy_score(y_pred=y_pred, y_true=y_true)),
+            Stat(MetricName("quasi_exact_match")).add(accuracy_score(y_pred=y_pred_quasi, y_true=y_true)),
+        ]
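For intuition, the new metric above labels an instance "typically_developing" only when the model's ASR transcript exactly matches the ground-truth transcription (after stripping and lowercasing), and "speech_disorder" otherwise. A minimal sketch of that labeling rule on hypothetical data:

# Toy illustration of the UltraSuiteASRMetric labeling rule (hypothetical data).
from sklearn.metrics import f1_score

# (ground-truth transcription, model ASR output, true label) per instance
instances = [
    ("a red ball", "a red ball", "typically_developing"),    # exact match
    ("a red ball", "a wed ball", "speech_disorder"),         # mismatch
    ("big blue bus", "big blue bus", "typically_developing"),
]

y_true = [label for _, _, label in instances]
y_pred = [
    "typically_developing" if truth.strip().lower() == output.strip().lower() else "speech_disorder"
    for truth, output, _ in instances
]

print(f1_score(y_true=y_true, y_pred=y_pred, average="macro"))  # 1.0 here: predictions match labels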
helm/benchmark/metrics/wildbench_metrics.py CHANGED
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
             Stat(MetricName("wildbench_score")).add(score),
             Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="wildbench_score",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="wildbench_score_rescaled",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
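The get_metadata method added here is the new self-describing pattern that many metrics gain in this release: each metric declares the statistics it emits so that helm-summarize can assemble a schema without a hand-written YAML entry. A minimal sketch for a hypothetical custom metric, using only the MetricMetadata fields visible in this diff (the real constructor may accept more arguments):

from typing import List

from helm.benchmark.metrics.metric import Metric, MetricMetadata


class MyCustomMetric(Metric):
    """Hypothetical metric illustrating the self-describing pattern."""

    def get_metadata(self) -> List[MetricMetadata]:
        # One MetricMetadata entry per stat name this metric emits.
        return [
            MetricMetadata(
                name="my_custom_score",  # must match the MetricName used for the Stat
                display_name="My Custom Score",
                short_display_name="MCS",
                description="Example score between 0 and 1.",
                lower_is_better=False,   # higher scores are better
                group="accuracy",        # metric group used by auto-generated schemas
            )
        ]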
helm/benchmark/model_deployment_registry.py CHANGED
@@ -157,12 +157,11 @@ def get_default_model_deployment_for_model(
     Example: "meta/llama-7b" => "together/llama-7b"
 
     The process to find a model deployment name is as follows:
-    1. If there is a model deployment with the same name as the model arg, use it.
-    2. If there is at least one deployment for the model, use the first one that is available.
-    3. If there are no deployments for the model, returns None.
+    1. If there is at least one deployment for the model, use the last one that is available.
+    2. If there are no deployments for the model, returns None.
 
     This function will also try to find a model deployment name that is not deprecated.
-    If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated).
+    If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
     If ignore_deprecated is True, this function will return None if the model deployment is deprecated.
 
 
@@ -175,16 +174,7 @@ def get_default_model_deployment_for_model(
         ignore_deprecated: Whether to return None if the model deployment is deprecated.
     """
 
-    # If there is a model deployment with the same name as the model arg, use it.
-    if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
-        deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
-        if deployment.deprecated and ignore_deprecated:
-            if warn_arg_deprecated:
-                hwarn(f"Model deployment {model_name} is deprecated")
-            return None
-        return deployment.name
-
-    # If there is at least one deployment for the model, use the first one that is available.
+    # If there is at least one deployment for the model, use the last one that is available.
     available_deployments: List[ModelDeployment] = [
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
    ]
@@ -199,19 +189,21 @@ def get_default_model_deployment_for_model(
         deployment for deployment in available_deployments if not deployment.deprecated
     ]
     if len(non_deprecated_deployments) > 0:
-        chosen_deployment = non_deprecated_deployments[0]
+        chosen_deployment = non_deprecated_deployments[-1]
     # There are no non-deprecated deployments, so there are two options:
     # 1. If we can return an empty string, return it. (no model deployment is available)
-    # 2. If we can't return an empty string, return the first deployment (even if it's deprecated).
+    # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
     elif ignore_deprecated:
         return None
-    else:
-        chosen_deployment = available_deployments[0]
+    elif len(available_deployments) > 0:
+        chosen_deployment = available_deployments[-1]
         if warn_arg_deprecated:
             hwarn(f"All model deployments for model {model_name} are deprecated.")
+    else:
+        return None
     if warn_arg_deprecated:
         hlog(
-            f"Choosing {chosen_deployment.name} (the first one) as "
+            f"Choosing {chosen_deployment.name} (the last one) as "
             f"the default model deployment for model {model_name}"
         )
         hlog("If you want to use a different model deployment, please specify it explicitly.")
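The behavioral change in this file is easy to miss: default deployment resolution now prefers the last matching deployment in ALL_MODEL_DEPLOYMENTS rather than the first, so deployments registered later in model_deployments.yaml win. A standalone sketch of the new selection order (simplified: warnings and the ignore_deprecated path are omitted, and Deployment is a stand-in for ModelDeployment):

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Deployment:  # simplified stand-in for ModelDeployment
    name: str
    model_name: str
    deprecated: bool = False


def pick_default(deployments: List[Deployment], model_name: str) -> Optional[str]:
    candidates = [d for d in deployments if d.model_name == model_name]
    non_deprecated = [d for d in candidates if not d.deprecated]
    if non_deprecated:
        return non_deprecated[-1].name  # last registered non-deprecated deployment wins
    if candidates:
        return candidates[-1].name      # fall back to the last deployment, even if deprecated
    return None


# Hypothetical registry: the newer entry registered later wins.
registry = [
    Deployment("together/llama-7b", "meta/llama-7b", deprecated=True),
    Deployment("newhost/llama-7b", "meta/llama-7b"),
]
assert pick_default(registry, "meta/llama-7b") == "newhost/llama-7b"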
helm/benchmark/presentation/create_plots.py CHANGED
@@ -1,4 +1,7 @@
-# mypy: check_untyped_defs = False
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 import argparse
 from collections import defaultdict
 from dataclasses import dataclass
@@ -637,8 +640,14 @@ def main():
         default="png",
         choices=["png", "pdf"],
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     create_plots(args)
 
 
helm/benchmark/presentation/run_display.py CHANGED
@@ -1,6 +1,7 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
+import re
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
 
 from helm.benchmark.adaptation.adapter_spec import (
@@ -262,9 +263,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if request_state.result is not None and request_state.result.completions
             else ""
         )
-        mapped_output = (
-            request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
-        )
+        mapped_output: Optional[str] = None
+        if request_state.output_mapping is not None:
+            output_to_map = predicted_text.strip()
+            if run_spec.adapter_spec.output_mapping_pattern:
+                match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+                if not match:
+                    output_to_map = ""
+                elif match.groups():
+                    output_to_map = match.group(0)
+                else:
+                    output_to_map = match.string
+            mapped_output = request_state.output_mapping.get(output_to_map)
         instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
             request_state.instance
         )
helm/benchmark/presentation/run_entry.py CHANGED
@@ -14,10 +14,10 @@ class RunEntry:
     description: str
 
     # Priority for this run spec (1 is highest priority, 5 is lowest priority)
-    priority: int
+    priority: Optional[int] = None
 
     # Additional groups to add to the run spec
-    groups: Optional[List[str]]
+    groups: Optional[List[str]] = None
 
 
 @dataclass(frozen=True)
helm/benchmark/presentation/schema.py CHANGED
@@ -8,6 +8,7 @@ import mako.template
 import yaml
 import importlib_resources as resources
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
@@ -131,24 +132,6 @@ THIS_GROUP_ONLY = "this_group_only"
 NO_GROUPS = "no_groups"
 
 
-@dataclass(frozen=True)
-class TaxonomyInfo:
-    # Task (e.g., question answering)
-    task: Optional[str] = None
-
-    # Domain - genre (e.g., Wikipedia)
-    what: Optional[str] = None
-
-    # Domain - when it was written (e.g., 2010s)
-    when: Optional[str] = None
-
-    # Domain - demographics (e.g., web users)
-    who: Optional[str] = None
-
-    # Language (e.g., English)
-    language: Optional[str] = None
-
-
 @dataclass(frozen=True)
 class RunGroup(Field):
     """
@@ -205,22 +188,27 @@ class RunGroup(Field):
     # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
     adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])
 
+    # Optional short description of the run group.
+    # This description is used in some space-constrained places in frontend tables.
+    # If unset, the description field will be used instead.
+    short_description: Optional[str] = None
+
 
 @dataclass
 class Schema:
     """Specifies information about what to display on the frontend."""
 
     # Information about each field
-    metrics: List[Field]
+    metrics: List[Field] = field(default_factory=list)
 
     # Information about each perturbation
-    perturbations: List[Field]
+    perturbations: List[Field] = field(default_factory=list)
 
     # Group the metrics
-    metric_groups: List[MetricGroup]
+    metric_groups: List[MetricGroup] = field(default_factory=list)
 
     # Group the scenarios
-    run_groups: List[RunGroup]
+    run_groups: List[RunGroup] = field(default_factory=list)
 
     # Adapter fields (e.g., temperature)
     # Automatically populated from the docstrings in the AdapterSpec class definition.
helm/benchmark/presentation/summarize.py CHANGED
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import get_all_stats_by_name
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -294,7 +303,6 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
 
 
 class AggregationStrategy:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     WIN_RATE = "win_rate"
     MEAN = "mean"
 
@@ -342,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -377,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
        self.allow_unknown_models: bool = allow_unknown_models
-
-        ensure_directory_exists(self.run_release_path)
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -427,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -510,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
@@ -839,7 +991,8 @@ class Summarizer:
         }
 
         header_name = header_field.get_short_display_name()
-        description = (run_group.description + "\n\n" if run_group.description is not None else "") + (
+        run_group_short_description = run_group.short_description or run_group.description or ""
+        description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
             (header_field.display_name if header_field.display_name else header_field.name)
             + ": "
             + (header_field.description if header_field.description is not None else "")
@@ -1070,7 +1223,8 @@ class Summarizer:
             is_scenario_table=False,
             aggregation_strategies=aggregate_strategies,
         )
-        tables.append(table)
+        if len(table.header) > 1:
+            tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.write_run_display_json(skip_completed)
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
         # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1340,8 +1504,19 @@ def main():
         default=None,
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     summarize(args)
 
 
helm/benchmark/presentation/taxonomy_info.py ADDED
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TaxonomyInfo:
+    # Task (e.g., question answering)
+    task: Optional[str] = None
+
+    # Domain - genre (e.g., Wikipedia)
+    what: Optional[str] = None
+
+    # Domain - when it was written (e.g., 2010s)
+    when: Optional[str] = None
+
+    # Domain - demographics (e.g., web users)
+    who: Optional[str] = None
+
+    # Language (e.g., English)
+    language: Optional[str] = None
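TaxonomyInfo (now in its own module, as above) and ScenarioMetadata feed the new --auto-generate-schema path in helm-summarize: when no schema YAML is given, fix_up_schema() builds metric fields from Metric.get_metadata() and run groups from Scenario.get_metadata(). A hedged sketch of the scenario side, using only the ScenarioMetadata fields that scenario_metadata_to_run_group() reads in the summarize.py hunks above; the actual constructor and Scenario signatures may differ slightly:

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


class MyScenario(Scenario):
    """Hypothetical scenario participating in schema auto-generation."""

    name = "my_scenario"
    description = "Example question answering scenario."
    tags = ["question_answering"]

    def get_instances(self, output_path: str):
        return []  # dataset loading elided in this sketch

    def get_metadata(self) -> ScenarioMetadata:
        return ScenarioMetadata(
            name="my_scenario",         # must match the run group name used in run specs
            display_name="My Scenario",
            short_display_name="MyScen",
            description="Example question answering scenario.",
            main_metric="exact_match",  # becomes ${main_name} in the auto-generated schema
            main_split="test",          # becomes ${main_split}
            taxonomy=TaxonomyInfo(task="question answering", language="English"),
        )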
helm/benchmark/presentation/test_create_plots.py CHANGED
@@ -1,4 +1,7 @@
-# mypy: check_untyped_defs = False
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 from helm.common.general import asdict_without_nones
 from helm.benchmark.presentation.table import Table, Cell, HeaderCell
 from helm.benchmark.presentation.create_plots import parse_table
helm/benchmark/run.py CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
     run_specs: List[RunSpec] = []
     for entry in run_entries:
         # Filter by priority
-        if priority is not None and entry.priority > priority:
+        if priority is not None and entry.priority is not None and entry.priority > priority:
             continue
 
         for run_spec in construct_run_specs(parse_object_spec(entry.description)):
@@ -298,8 +298,7 @@ def helm_run(args):
     hlog("Done.")
 
 
-# Separate parsing from starting HELM so we can setup logging
-def main():
+def build_parser():
     parser = argparse.ArgumentParser()
     add_service_args(parser)
     parser.add_argument(
@@ -365,9 +364,21 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     add_run_args(parser)
+    return parser
+
+
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = build_parser()
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     return helm_run(args)
 
 
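helm-run, helm-summarize, and helm-create-plots all gain the same --log-config flag, forwarded to setup_default_logging. The expected YAML layout is not shown in this diff; the sketch below assumes it maps onto Python's standard logging.config.dictConfig schema, which is an assumption rather than HELM's documented format:

# Hypothetical loader: assumes the --log-config YAML is a dictConfig mapping.
import logging.config

import yaml


def load_log_config(path: str) -> None:
    with open(path) as f:
        config = yaml.safe_load(f)     # parse the YAML file into a dict
    logging.config.dictConfig(config)  # hand it to the stdlib logging machinery


# e.g. "helm-run --log-config logging.yaml ..." would then configure logging
# roughly as load_log_config("logging.yaml") does here.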
helm/benchmark/run_expander.py CHANGED
@@ -1484,6 +1484,8 @@ class OutputFormatInstructions(RunExpander):
             instructions = "Answer with only a single letter. Do not include a period in your answer."
         elif self.scenario == "mcqa_only_last_question":
             instructions = "Answer only the last question with only a single letter."
+        elif self.scenario == "arabic_mcqa":
+            instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
         else:
             instructions = "Answer with only a single letter."
     elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1525,6 +1527,8 @@ class OutputFormatInstructions(RunExpander):
                 "Answer only the last question with a short answer. "
                 "Avoid extra, unnecessary information in the answer."
             )
+        elif self.scenario == "arabic_mcqa":
+            instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
         else:
             raise ValueError(f"Unknown scenario {self.scenario}")
     elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: