crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/unitxt_metrics.py
@@ -5,12 +5,12 @@ from typing import Dict, List, Set
 from datasets import load_dataset
 import evaluate
 
-from helm.common.general import hlog
 from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hwarn
 
 
 class UnitxtMetric(MetricInterface):
@@ -18,7 +18,10 @@ class UnitxtMetric(MetricInterface):
 
     def __init__(self, **kwargs):
         super().__init__()
-        dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
+        if len(kwargs) == 1 and "recipe" in kwargs:
+            dataset_name = kwargs["recipe"]
+        else:
+            dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
         self.dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
 
     def evaluate(
@@ -83,9 +86,8 @@ class UnitxtMetric(MetricInterface):
                 )
             )
         if non_number_instance_metric_names:
-            hlog(
-                "WARNING: Ignored Unitxt instance metrics because "
-                f"they were not numbers: {non_number_instance_metric_names}"
+            hwarn(
+                "Ignored Unitxt instance metrics because " f"they were not numbers: {non_number_instance_metric_names}"
             )
 
         # Extract global metrics
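
The `__init__` change above special-cases a lone `recipe` keyword so Unitxt recipe strings pass through verbatim instead of being re-serialized as `key=value` pairs. A minimal standalone sketch of that naming rule (the helper name and example strings are illustrative, not from the package):

    def unitxt_dataset_name(kwargs: dict) -> str:
        # A lone "recipe" kwarg is passed through verbatim; anything else is
        # flattened into Unitxt's "key=value,key=value" syntax.
        if len(kwargs) == 1 and "recipe" in kwargs:
            return kwargs["recipe"]
        return ",".join(f"{key}={value}" for key, value in kwargs.items())

    assert unitxt_dataset_name({"recipe": "card=cards.wnli,template_card_index=0"}) == "card=cards.wnli,template_card_index=0"
    assert unitxt_dataset_name({"card": "cards.wnli", "template_card_index": 0}) == "card=cards.wnli,template_card_index=0"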
helm/benchmark/metrics/vision_language/emd_utils.py
@@ -320,8 +320,10 @@ def compute_emd_recursive(
     mask1 = np.any(sig1[:, 1:-2] != gray_most_frequent_color, axis=1)
     mask2 = np.any(sig2[:, 1:-2] != gray_most_frequent_color, axis=1)
     mask = np.logical_or(mask1, mask2)
-    sig1 = sig1[mask]
-    sig2 = sig2[mask]
+
+    if np.any(mask):
+        sig1 = sig1[mask]
+        sig2 = sig2[mask]
 
     # Normalize the weights
     weight1 = sig1[:, 0]
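
The new `np.any(mask)` guard keeps both signatures non-empty when no row differs from the most frequent gray color; an empty signature would make the weight normalization that follows divide by zero. A toy illustration (the array values are made up):

    import numpy as np

    sig = np.array([[0.5, 10.0, 10.0], [0.5, 10.0, 10.0]])  # toy signature: [weight, features...]
    mask = np.zeros(len(sig), dtype=bool)  # no row differs from the background color

    filtered = sig[mask]         # shape (0, 3): every row would be dropped
    print(filtered[:, 0].sum())  # 0.0, so weight / weight.sum() would divide by zero

    if np.any(mask):             # patched behavior: only filter when something survives
        sig = sig[mask]
    print(sig.shape)             # (2, 3): the full signatures are kept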
helm/benchmark/metrics/vision_language/image_metrics.py
@@ -324,7 +324,7 @@ class AnnotatedImageMetrics(Metric):
 
         # Compute the LPIPS score
         assert self._lpips_metric is not None
-        score: float = self._lpips_metric(img1, img2).detach().item()
+        score: float = 1.0 - self._lpips_metric(img1, img2).detach().item()
         return score
 
     def _calculate_fid(self, act1, act2):
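
LPIPS is a perceptual distance: 0.0 for identical images, larger for more dissimilar ones. Subtracting it from 1.0 turns the score into a similarity where higher is better, consistent with the direction of the metric's other scores:

    lpips_distance = 0.12              # hypothetical value returned by the LPIPS network
    similarity = 1.0 - lpips_distance  # 0.88; identical images now score 1.0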
helm/benchmark/metrics/vision_language/image_utils.py
@@ -84,8 +84,8 @@ def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     orb = cv2.ORB_create() if hasattr(cv2, "ORB_create") else cv2.ORB()
 
     # Find the keypoints and descriptors with ORB
-    _, desc_a = orb.detectAndCompute(img_a, None)
-    _, desc_b = orb.detectAndCompute(img_b, None)
+    _, desc_a = orb.detectAndCompute(img_a, None)  # type: ignore
+    _, desc_b = orb.detectAndCompute(img_b, None)  # type: ignore
 
     # Initialize the brute force matcher
     bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)
helm/benchmark/metrics/wildbench_metrics.py (new file)
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class WildBenchScoreMetric(Metric):
+    """Score metrics for WildBench."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["wildbench"]
+        scores: List[float] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_score") and annotation_value is not None:
+                scores.append(annotation_value)
+        if not scores:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(scores) / len(scores)
+        score_rescaled = (score - 1) / 9
+        return [
+            Stat(MetricName("wildbench_score")).add(score),
+            Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
+        ]
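
The rescaling at the end maps the judges' 1-10 scale onto [0, 1]: the per-annotator `*_score` values are averaged, then shifted and divided by the width of the scale. A worked example with made-up scores:

    scores = [7.0, 8.0, 9.0]           # hypothetical "*_score" values from three judges
    score = sum(scores) / len(scores)  # 8.0
    score_rescaled = (score - 1) / 9   # 0.777...; a score of 1 maps to 0.0, a 10 to 1.0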
helm/benchmark/model_deployment_registry.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeployment:
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
 
 
@@ -182,7 +180,7 @@ def get_default_model_deployment_for_model(
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
     if deployment.deprecated and ignore_deprecated:
         if warn_arg_deprecated:
-            hlog(f"WARNING: Model deployment {model_name} is deprecated")
+            hwarn(f"Model deployment {model_name} is deprecated")
         return None
     return deployment.name
 
@@ -193,7 +191,7 @@ def get_default_model_deployment_for_model(
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
         hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
 
         # Additionally, if there is a non-deprecated deployment, use it.
@@ -210,7 +208,7 @@ def get_default_model_deployment_for_model(
     else:
         chosen_deployment = available_deployments[0]
         if warn_arg_deprecated:
-            hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+            hwarn(f"All model deployments for model {model_name} are deprecated.")
     if warn_arg_deprecated:
         hlog(
             f"Choosing {chosen_deployment.name} (the first one) as "
helm/benchmark/model_metadata_registry.py
@@ -22,6 +22,9 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
+# For NOVA models
+NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
+
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -66,10 +69,18 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# For audio-language models (AudioLMs)
+AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
+
 # Deprecated models that are no longer available.
 # These are usually closed API models that have been permanently removed
 DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
+# Unsupported models.
+# These are models that we have chosen not to support because they are
+# private, stale, non-notable, or difficult to implement.
+UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
+
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 
@@ -208,6 +219,11 @@ def is_vlm(model_name: str) -> bool:
     return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
 
 
+def is_audiolm(model_name: str) -> bool:
+    """Returns True if the model is an audio-language model (AudioLM). False otherwise."""
+    return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
+
+
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(
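
`is_audiolm` follows the same pattern as the existing `is_vlm` helper: membership of a tag in the model's metadata. A self-contained sketch with `model_has_tag` stubbed by a hypothetical lookup table:

    AUDIO_LANGUAGE_MODEL_TAG = "AUDIO_LANGUAGE_MODEL_TAG"

    _MODEL_TAGS = {  # hypothetical registry contents
        "example-org/audio-model": [AUDIO_LANGUAGE_MODEL_TAG],
        "example-org/text-model": [],
    }

    def model_has_tag(model_name: str, tag: str) -> bool:
        return tag in _MODEL_TAGS.get(model_name, [])

    def is_audiolm(model_name: str) -> bool:
        return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)

    assert is_audiolm("example-org/audio-model")
    assert not is_audiolm("example-org/text-model")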
helm/benchmark/presentation/contamination.py
@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml
 
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema
 
@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")
 
 
 def read_contamination():
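
This file, like many others in the release, swaps `hlog("WARNING: ...")` calls for `hwarn(...)`. Judging purely from the call sites, `hwarn` behaves like a warning-prefixing convenience wrapper around `hlog`; a hedged sketch (the real definition is in helm/common/hierarchical_logger.py and may differ):

    def hlog(message: str) -> None:
        print(message)  # stand-in; the real hlog tracks hierarchical indentation

    def hwarn(message: str) -> None:
        hlog(f"WARNING: {message}")  # assumed behavior, inferred from the diffs

    hwarn("group g not defined in schema")  # -> WARNING: group g not defined in schema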
helm/benchmark/presentation/create_plots.py
@@ -11,7 +11,7 @@ import numpy as np
 from scipy.stats import pearsonr
 
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 
@@ -600,17 +600,7 @@ class Plotter:
         self.create_constrast_set_plots()
 
 
-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +611,36 @@ def main():
     plotter.create_all_plots()
 
 
+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    args = parser.parse_args()
+    setup_default_logging()
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
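
Splitting argument parsing (`main`) from the plotting logic (`create_plots(args)`) lets other entry points drive plotting with a prebuilt namespace instead of going through the CLI. A hedged usage sketch (the suite name is hypothetical):

    import argparse

    from helm.benchmark.presentation.create_plots import create_plots

    args = argparse.Namespace(
        output_path="benchmark_output",  # same default the CLI exposes
        suite="my-suite",                # hypothetical suite name
        plot_format="png",
    )
    create_plots(args)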
helm/benchmark/presentation/run_display.py
@@ -59,6 +59,9 @@ class DisplayPrediction:
 
     annotations: Optional[Dict[str, Any]]
 
+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+
 
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )
 
+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if os.path.exists(image_location)
         ]
 
+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
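
The `thinking_text` plumbing is deliberately guarded: `thinking` is optional on a completion, so the display field stays None unless the model produced any. The extraction in isolation, with stand-in dataclasses for the real completion types:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass(frozen=True)
    class Thinking:  # stand-in for the real completion types
        text: str

    @dataclass(frozen=True)
    class Completion:
        text: str
        thinking: Optional[Thinking] = None

    for completion in (Completion("42", Thinking("Add 40 and 2.")), Completion("42")):
        thinking_text: Optional[str] = completion.thinking.text if completion.thinking else None
        print(thinking_text)  # "Add 40 and 2.", then None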
@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn
 
 
 # TODO: change to `helm.benchmark.config`
@@ -281,5 +282,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
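This is the first of many hunks in this release that replace hand-written `hlog(f"WARNING: ...")` calls with the new `hwarn(...)` helper, so the "WARNING: " prefix is applied in one place instead of at every call site. HELM's real implementation lives in `helm.common.hierarchical_logger` and is not shown in this diff; a hypothetical sketch of the idea:

    def hlog(message: str) -> None:
        # Stand-in for helm.common.hierarchical_logger.hlog.
        print(message)


    def hwarn(message: str) -> None:
        # Centralizing the prefix keeps warning formatting consistent and
        # would let warnings be routed to a real logging level later.
        hlog(f"WARNING: {message}")


    hwarn("The `adapter` field is deprecated and should be removed from schema file schema.yaml")
    # prints: WARNING: The `adapter` field is deprecated and should be removed from schema file schema.yaml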
@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -52,6 +52,10 @@ from helm.benchmark.presentation.schema import (
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
+
+
+MODEL_HEADER_CELL_VALUE = "Model"
 
 
 @dataclass(frozen=True)
@@ -98,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-        hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -262,18 +266,22 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     """
 
     row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
 
     # check for all header cells where specified, that lower_is_better is consistent
     orderings = []
-    for elem in table.header:
-        orderings.append(elem.lower_is_better)
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
     if len(set(orderings)) != 1:
         raise Exception("Cannot mean columns with different values for lower_is_better")
 
     for row in table.rows:
         total = 0.0
         count = 0
-        for cell in row:
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
             if cell.value is not None:
                 total += float(cell.value)
                 count += 1
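The change above teaches `compute_aggregate_row_means` to skip the leading model-name column (identified by the `MODEL_HEADER_CELL_VALUE` header) so that only score columns enter the mean; without the skip, `float(cell.value)` would fail on a model name. A simplified, runnable re-creation of the logic with stand-in types (HELM's real `Table` and `HeaderCell` live in the presentation package):

    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class Cell:  # stand-in for HELM's table cell
        value: Optional[object]


    MODEL_HEADER_CELL_VALUE = "Model"


    def row_means(header: List[str], rows: List[List[Cell]]) -> List[Optional[float]]:
        # Skip the first column when it holds model names rather than scores;
        # otherwise float("gpt-4") below would raise ValueError.
        skip_first = bool(header) and header[0] == MODEL_HEADER_CELL_VALUE
        means: List[Optional[float]] = []
        for row in rows:
            cells = row[1:] if skip_first else row
            values = [float(c.value) for c in cells if c.value is not None]
            means.append(sum(values) / len(values) if values else None)
        return means


    rows = [[Cell("gpt-4"), Cell(0.9), Cell(0.7)], [Cell("llama"), Cell(0.5), Cell(None)]]
    print(row_means(["Model", "MMLU", "GSM8K"], rows))  # [0.8, 0.5]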
@@ -398,8 +406,8 @@ class Summarizer:
         included = False
         for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
             if run_group_name not in self.schema.name_to_run_group:
-                hlog(
-                    f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                hwarn(
+                    f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                     f"but undefined in {self.schema_path}, skipping"
                 )
                 continue
@@ -432,14 +440,14 @@
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -536,8 +544,8 @@
 
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
 
@@ -730,8 +738,8 @@
             if stat is None:
                 # Print out near misses to provide a more informative warning
                 near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-                hlog(
-                    f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+                hwarn(
+                    f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                     f"{len(near_misses)} near misses matching just the name"
                 )
                 if len(near_misses) > 0:
@@ -802,7 +810,7 @@
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title} has no rows or columns, leaving empty")
             return Table("empty", [], [])
 
         header: List[HeaderCell] = []
@@ -811,7 +819,7 @@
         num_groups = len(set(run_group.name for run_group, _ in columns))  # number of unique groups, determines headers
 
         # Column headers
-        header.append(HeaderCell("Model/adapter"))
+        header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
         for run_group, metric_group_name in columns:
             # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
             if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -823,7 +831,7 @@
                     matcher = replace(matcher, sub_split=sub_split)
                 header_field = self.schema.name_to_metric.get(matcher.name)
                 if header_field is None:
-                    hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                    hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                     continue
                 metadata = {
                     "metric": header_field.get_short_display_name(),
@@ -951,8 +959,8 @@
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -969,22 +977,20 @@
         if strategy == AggregationStrategy.WIN_RATE:
             WIN_RATE_AGGREGATION = "mean"
             win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-            description = "How many models this model outperforms on average (over columns)."
             aggregate_header_cells.append(
                 HeaderCell(
                     f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                    description=description,
+                    description="How many models this model outperforms on average (over columns).",
                     lower_is_better=False,
                 )
             )
             aggregate_row_values.append(win_rates)
         elif strategy == AggregationStrategy.MEAN:
             means = compute_aggregate_row_means(table)
-            description = "An average over columns representing the mean performance."
             aggregate_header_cells.append(
                 HeaderCell(
-                    "Mean performance",
-                    description=description,
+                    "Mean score",
+                    description="The mean of the scores from all columns.",
                     lower_is_better=table.header[0].lower_is_better,
                 )
             )
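For the WIN_RATE strategy, the aggregate column reported per model is its mean win rate: in each score column, the fraction of other models it beats, averaged over columns. `compute_aggregate_row_win_rates` itself is not shown in this diff, so the following is an illustrative re-derivation of the header description above rather than HELM's actual code:

    from typing import List


    def mean_win_rates(columns: List[List[float]], lower_is_better: bool = False) -> List[float]:
        # columns[k][i] is model i's score on metric k; assumes >= 2 models.
        num_models = len(columns[0])
        per_model: List[List[float]] = [[] for _ in range(num_models)]
        for col in columns:
            for i, score in enumerate(col):
                others = [s for j, s in enumerate(col) if j != i]
                wins = sum(1 for s in others if (score < s if lower_is_better else score > s))
                per_model[i].append(wins / len(others))
        # Mean over columns, matching WIN_RATE_AGGREGATION = "mean".
        return [sum(rates) / len(rates) for rates in per_model]


    # Two metric columns, three models:
    print(mean_win_rates([[0.9, 0.5, 0.1], [0.8, 0.6, 0.7]]))  # [1.0, 0.25, 0.25]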
@@ -1226,10 +1232,57 @@
 
 
 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
@@ -1247,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1272,47 +1334,15 @@
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
-    args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    args = parser.parse_args()
+    setup_default_logging()
+    summarize(args)
 
 
 if __name__ == "__main__":
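With `--summarizer-class-name` in place, downstream users can point `helm-summarize` at their own `Summarizer` subclass. A hypothetical example (the module path `my_pkg.my_module` and the subclass are placeholders, not part of this release):

    from helm.benchmark.presentation.summarize import Summarizer


    class CustomSummarizer(Summarizer):
        """Hypothetical subclass; a real one might override table construction
        or JSON emission while reusing the rest of the pipeline."""


    # Selected at runtime, e.g.:
    #   helm-summarize --suite my-suite --summarizer-class-name my_pkg.my_module.CustomSummarizer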