crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py
ADDED
@@ -0,0 +1,52 @@
+from typing import List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+)
+from sklearn.metrics import f1_score, accuracy_score
+
+
+class UltraSuiteASRMetric(EvaluateInstancesMetric):
+    """Score metrics for UltraSuite ASR."""
+
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        y_pred: List[str] = []
+        y_pred_quasi: List[str] = []
+        y_true: List[str] = []
+        for request_state in request_states:  # one request state per instance
+
+            for reference in request_state.instance.references:
+                if reference.tags == [CORRECT_TAG]:
+                    true_label = reference.output.text
+                    break
+
+            assert request_state.result
+            model_output_text = request_state.result.completions[0].text.strip().lower()
+            assert request_state.instance.extra_data
+            ground_truth_text = request_state.instance.extra_data["transcription"].strip().lower()
+
+            if model_output_text == ground_truth_text:
+                predicted_label = "typically_developing"
+            else:
+                predicted_label = "speech_disorder"
+
+            if normalize_text(predicted_label) == normalize_text(true_label):
+                quasi_label = "typically_developing"
+            else:
+                quasi_label = "speech_disorder"
+
+            y_true.append(true_label)
+            y_pred.append(predicted_label)
+            y_pred_quasi.append(quasi_label)
+
+        return [
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+            Stat(MetricName("exact_match")).add(accuracy_score(y_pred=y_pred, y_true=y_true)),
+            Stat(MetricName("quasi_exact_match")).add(accuracy_score(y_pred=y_pred_quasi, y_true=y_true)),
+        ]
helm/benchmark/metrics/wildbench_metrics.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
             Stat(MetricName("wildbench_score")).add(score),
             Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="wildbench_score",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="wildbench_score_rescaled",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
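For context, the `get_metadata` hook added above is the same pattern applied across many of the metric modules in the file list (the repeated `+13 -1` and `+14 -1` changes). A minimal, hypothetical sketch of a metric that opts in — `MyCustomMetric` and `my_custom_score` are illustrative names, not code from the package — could look like the following; the field names simply mirror the `MetricMetadata` usage shown in the hunk above.

from typing import List

from helm.benchmark.metrics.metric import Metric, MetricMetadata


class MyCustomMetric(Metric):
    def get_metadata(self) -> List[MetricMetadata]:
        # Field names mirror the MetricMetadata usage in the WildBench hunk above.
        return [
            MetricMetadata(
                name="my_custom_score",
                display_name="My Custom Score",
                short_display_name="Custom",
                description="Example score produced by this metric; higher is better.",
                lower_is_better=False,
                group="accuracy",
            )
        ]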
helm/benchmark/model_deployment_registry.py
CHANGED
@@ -157,12 +157,11 @@ def get_default_model_deployment_for_model(
     Example: "meta/llama-7b" => "together/llama-7b"
 
     The process to find a model deployment name is as follows:
-    1. If there is
-    2. If there
-    3. If there are no deployments for the model, returns None.
+    1. If there is at least one deployment for the model, use the last one that is available.
+    2. If there are no deployments for the model, returns None.
 
     This function will also try to find a model deployment name that is not deprecated.
-    If there are no non-deprecated deployments, it will return the
+    If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
     If ignore_deprecated is True, this function will return None if the model deployment is deprecated.
 
     If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same
@@ -175,16 +174,7 @@ def get_default_model_deployment_for_model(
         ignore_deprecated: Whether to return None if the model deployment is deprecated.
     """
 
-    # If there is
-    if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
-        deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
-        if deployment.deprecated and ignore_deprecated:
-            if warn_arg_deprecated:
-                hwarn(f"Model deployment {model_name} is deprecated")
-            return None
-        return deployment.name
-
-    # If there is at least one deployment for the model, use the first one that is available.
+    # If there is at least one deployment for the model, use the last one that is available.
     available_deployments: List[ModelDeployment] = [
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
     ]
@@ -199,19 +189,21 @@ def get_default_model_deployment_for_model(
         deployment for deployment in available_deployments if not deployment.deprecated
     ]
     if len(non_deprecated_deployments) > 0:
-        chosen_deployment = non_deprecated_deployments[
+        chosen_deployment = non_deprecated_deployments[-1]
     # There are no non-deprecated deployments, so there are two options:
     # 1. If we can return an empty string, return it. (no model deployment is available)
-    # 2. If we can't return an empty string, return the
+    # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
     elif ignore_deprecated:
         return None
-
-        chosen_deployment = available_deployments[
+    elif len(available_deployments) > 0:
+        chosen_deployment = available_deployments[-1]
         if warn_arg_deprecated:
             hwarn(f"All model deployments for model {model_name} are deprecated.")
+    else:
+        return None
     if warn_arg_deprecated:
         hlog(
-            f"Choosing {chosen_deployment.name} (the
+            f"Choosing {chosen_deployment.name} (the last one) as "
             f"the default model deployment for model {model_name}"
         )
         hlog("If you want to use a different model deployment, please specify it explicitly.")
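The docstring change above describes the new selection rule: prefer the last available non-deprecated deployment, otherwise fall back to the last deployment even if it is deprecated. A self-contained sketch of that rule on plain (name, deprecated) pairs — illustrative only, not the package's actual helper, and the deployment names are made up — is:

from typing import List, Optional, Tuple


def pick_default_deployment(deployments: List[Tuple[str, bool]]) -> Optional[str]:
    """Return the last non-deprecated name, else the last name, else None."""
    non_deprecated = [name for name, deprecated in deployments if not deprecated]
    if non_deprecated:
        return non_deprecated[-1]
    if deployments:
        return deployments[-1][0]
    return None


# Example: the later, non-deprecated deployment wins over the deprecated one.
assert pick_default_deployment([("old-host/model-x", True), ("new-host/model-x", False)]) == "new-host/model-x"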
helm/benchmark/presentation/create_plots.py
CHANGED
@@ -1,4 +1,7 @@
-#
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 import argparse
 from collections import defaultdict
 from dataclasses import dataclass
@@ -637,8 +640,14 @@ def main():
         default="png",
         choices=["png", "pdf"],
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     create_plots(args)
 
 
helm/benchmark/presentation/run_display.py
CHANGED
@@ -1,6 +1,7 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
+import re
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
 
 from helm.benchmark.adaptation.adapter_spec import (
@@ -262,9 +263,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if request_state.result is not None and request_state.result.completions
             else ""
         )
-        mapped_output =
-
-
+        mapped_output: Optional[str] = None
+        if request_state.output_mapping is not None:
+            output_to_map = predicted_text.strip()
+            if run_spec.adapter_spec.output_mapping_pattern:
+                match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+                if not match:
+                    output_to_map = ""
+                elif match.groups():
+                    output_to_map = match.group(0)
+                else:
+                    output_to_map = match.string
+            mapped_output = request_state.output_mapping.get(output_to_map)
         instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
            request_state.instance
        )
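The run_display change above routes each completion through the adapter's new `output_mapping_pattern` before looking it up in `output_mapping`. A stripped-down illustration of that idea — a standalone function, not the HELM code itself — is:

import re
from typing import Dict, Optional


def map_output(completion: str, pattern: Optional[str], output_mapping: Dict[str, str]) -> Optional[str]:
    # Optionally extract the part of the completion that matches the pattern,
    # then translate it back to a reference label via output_mapping.
    text = completion.strip()
    if pattern:
        match = re.search(pattern, text)
        text = match.group(0) if match else ""
    return output_mapping.get(text)


# Example: pull the answer letter out of a verbose completion.
assert map_output("The answer is B.", r"[A-E]", {"A": "first choice", "B": "second choice"}) == "second choice"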
helm/benchmark/presentation/run_entry.py
CHANGED
@@ -14,10 +14,10 @@ class RunEntry:
     description: str
 
     # Priority for this run spec (1 is highest priority, 5 is lowest priority)
-    priority: int
+    priority: Optional[int] = None
 
     # Additional groups to add to the run spec
-    groups: Optional[List[str]]
+    groups: Optional[List[str]] = None
 
 
 @dataclass(frozen=True)
helm/benchmark/presentation/schema.py
CHANGED
@@ -8,6 +8,7 @@ import mako.template
 import yaml
 import importlib_resources as resources
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
@@ -131,24 +132,6 @@ THIS_GROUP_ONLY = "this_group_only"
 NO_GROUPS = "no_groups"
 
 
-@dataclass(frozen=True)
-class TaxonomyInfo:
-    # Task (e.g., question answering)
-    task: Optional[str] = None
-
-    # Domain - genre (e.g., Wikipedia)
-    what: Optional[str] = None
-
-    # Domain - when it was written (e.g., 2010s)
-    when: Optional[str] = None
-
-    # Domain - demographics (e.g., web users)
-    who: Optional[str] = None
-
-    # Language (e.g., English)
-    language: Optional[str] = None
-
-
 @dataclass(frozen=True)
 class RunGroup(Field):
     """
@@ -205,22 +188,27 @@ class RunGroup(Field):
     # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
     adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])
 
+    # Optional short description of the run group.
+    # This description is used in some space-constrained places in frontend tables.
+    # If unset, the description field will be used instead.
+    short_description: Optional[str] = None
+
 
 @dataclass
 class Schema:
     """Specifies information about what to display on the frontend."""
 
     # Information about each field
-    metrics: List[Field]
+    metrics: List[Field] = field(default_factory=list)
 
     # Information about each perturbation
-    perturbations: List[Field]
+    perturbations: List[Field] = field(default_factory=list)
 
     # Group the metrics
-    metric_groups: List[MetricGroup]
+    metric_groups: List[MetricGroup] = field(default_factory=list)
 
     # Group the scenarios
-    run_groups: List[RunGroup]
+    run_groups: List[RunGroup] = field(default_factory=list)
 
     # Adapter fields (e.g., temperature)
     # Automatically populated from the docstrings in the AdapterSpec class definition.
helm/benchmark/presentation/summarize.py
CHANGED
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -294,7 +303,6 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
 
 
 class AggregationStrategy:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     WIN_RATE = "win_rate"
     MEAN = "mean"
 
@@ -342,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -377,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
-
-
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -427,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -510,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
@@ -839,7 +991,8 @@ class Summarizer:
             }
 
             header_name = header_field.get_short_display_name()
-
+            run_group_short_description = run_group.short_description or run_group.description or ""
+            description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
                 (header_field.display_name if header_field.display_name else header_field.name)
                 + ": "
                 + (header_field.description if header_field.description is not None else "")
@@ -1070,7 +1223,8 @@ class Summarizer:
                 is_scenario_table=False,
                 aggregation_strategies=aggregate_strategies,
             )
-
+            if len(table.header) > 1:
+                tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
         # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1340,8 +1504,19 @@ def main():
         default=None,
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     summarize(args)
 
 
helm/benchmark/presentation/taxonomy_info.py
ADDED
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TaxonomyInfo:
+    # Task (e.g., question answering)
+    task: Optional[str] = None
+
+    # Domain - genre (e.g., Wikipedia)
+    what: Optional[str] = None
+
+    # Domain - when it was written (e.g., 2010s)
+    when: Optional[str] = None
+
+    # Domain - demographics (e.g., web users)
+    who: Optional[str] = None
+
+    # Language (e.g., English)
+    language: Optional[str] = None
helm/benchmark/presentation/test_create_plots.py
CHANGED
@@ -1,4 +1,7 @@
-#
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 from helm.common.general import asdict_without_nones
 from helm.benchmark.presentation.table import Table, Cell, HeaderCell
 from helm.benchmark.presentation.create_plots import parse_table
helm/benchmark/run.py
CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
     run_specs: List[RunSpec] = []
     for entry in run_entries:
         # Filter by priority
-        if priority is not None and entry.priority > priority:
+        if priority is not None and entry.priority is not None and entry.priority > priority:
             continue
 
         for run_spec in construct_run_specs(parse_object_spec(entry.description)):
@@ -298,8 +298,7 @@ def helm_run(args):
     hlog("Done.")
 
 
-
-def main():
+def build_parser():
     parser = argparse.ArgumentParser()
     add_service_args(parser)
     parser.add_argument(
@@ -365,9 +364,21 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     add_run_args(parser)
+    return parser
+
+
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = build_parser()
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     return helm_run(args)
 
 
helm/benchmark/run_expander.py
CHANGED
@@ -1484,6 +1484,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 instructions = "Answer with only a single letter."
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1525,6 +1527,8 @@ class OutputFormatInstructions(RunExpander):
                     "Answer only the last question with a short answer. "
                     "Avoid extra, unnecessary information in the answer."
                 )
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
         elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: