crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the release notes in its registry listing for more details.

Files changed (1033):
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -9,12 +9,11 @@ Usage:
9
9
  """
10
10
 
11
11
  import argparse
12
- import cattrs
12
+ import dataclasses
13
13
  import os
14
14
  import datetime
15
15
  import urllib.parse
16
16
  import json
17
- import yaml
18
17
  from collections import defaultdict
19
18
  from dataclasses import dataclass, replace
20
19
  from statistics import mean, median
@@ -22,50 +21,50 @@ from typing import List, Optional, Dict, Any, Tuple, Set
22
21
 
23
22
  from tqdm import tqdm
24
23
  from helm.benchmark.model_deployment_registry import get_model_deployment
25
-
26
24
  from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
27
25
  from helm.common.general import (
28
26
  write,
29
27
  ensure_directory_exists,
30
28
  asdict_without_nones,
31
- serialize_dates,
32
29
  parallel_map,
33
30
  singleton,
34
31
  unique_simplification,
35
32
  )
36
33
  from helm.common.codec import from_json
37
- from helm.common.hierarchical_logger import hlog, htrack, htrack_block
38
- from helm.benchmark.scenarios.scenario import ScenarioSpec
34
+ from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
35
+ from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
39
36
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
40
- from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
41
- from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
42
37
  from helm.benchmark.metrics.metric_name import MetricName
43
- from helm.benchmark.metrics.metric import get_all_stats_by_name
38
+ from helm.benchmark.metrics.metric import (
39
+ MetricInterface,
40
+ MetricMetadata,
41
+ MetricSpec,
42
+ create_metric,
43
+ get_all_stats_by_name,
44
+ )
44
45
  from helm.benchmark.metrics.statistic import Stat, merge_stat
45
- from helm.benchmark.runner import RunSpec, LATEST_SYMLINK
46
+ from helm.benchmark.run_spec import RunSpec
47
+ from helm.benchmark.runner import LATEST_SYMLINK
46
48
  from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
47
49
  from helm.benchmark.presentation.schema import (
50
+ MetricGroup,
48
51
  MetricNameMatcher,
49
52
  RunGroup,
53
+ Field,
54
+ Schema,
50
55
  read_schema,
51
- SCHEMA_CLASSIC_YAML_FILENAME,
56
+ get_default_schema_path,
52
57
  BY_GROUP,
53
58
  THIS_GROUP_ONLY,
54
59
  NO_GROUPS,
55
60
  )
56
- from helm.benchmark.presentation.contamination import (
57
- read_contamination,
58
- validate_contamination,
59
- CONTAMINATION_SYMBOLS,
60
- CONTAMINATION_STYLES,
61
- CONTAMINATION_LEVEL_STRONG,
62
- )
63
61
  from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
64
62
  from helm.benchmark.presentation.run_display import write_run_display_json
65
- from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata
63
+ from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
64
+ from helm.common.object_spec import get_class_by_name
66
65
 
67
66
 
68
- OVERLAP_N_COUNT = 13
67
+ MODEL_HEADER_CELL_VALUE = "Model"
69
68
 
70
69
 
71
70
  @dataclass(frozen=True)
@@ -112,7 +111,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
112
111
  # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
113
112
  # as the main metric but multiple_choice_separate_original only generates exact_match
114
113
  if matcher.name == "quasi_exact_match":
115
- hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
114
+ hwarn("No quasi_exact_match metric found, looking for exact_match instead")
116
115
  matcher = replace(matcher, name="exact_match")
117
116
  matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
118
117
  if len(matching_stats) == 0:
@@ -172,7 +171,7 @@ def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetad
172
171
  except ValueError:
173
172
  pass
174
173
 
175
- # Return a placeholder "unknoown model" model metadata.
174
+ # Return a placeholder "unknown model" model metadata.
176
175
  return get_unknown_model_metadata(adapter_spec.model)
177
176
 
178
177
 
@@ -233,25 +232,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
233
232
  """
234
233
  assert aggregation in ["mean", "median"]
235
234
  win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
236
- for i, header_cell in enumerate(table.header):
235
+ for column_index, header_cell in enumerate(table.header):
237
236
  lower_is_better = header_cell.lower_is_better
238
237
  if lower_is_better is None: # column does not have a meaningful ordering
239
238
  continue
240
-
241
- # sort row indices by cell value and then compute the number of wins as the index in the sorted list
242
- def is_cell_valid(cell: Cell) -> bool: # ignore cells which are strongly contaminated or have no value
243
- if cell.value is None:
244
- return False
245
- if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
246
- return False
247
- return True
248
-
249
- values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
250
- if len(values) < 2: # don't rank a single model
239
+ value_to_count: Dict[float, int] = defaultdict(int)
240
+ for row in table.rows:
241
+ value = row[column_index].value
242
+ if value is not None:
243
+ value_to_count[value] += 1
244
+ value_to_wins: Dict[float, float] = {}
245
+ acc_count = 0
246
+ for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
247
+ value_to_wins[value] = acc_count + ((value_count - 1) / 2)
248
+ acc_count += value_count
249
+ total_count = acc_count
250
+ if total_count < 2:
251
251
  continue
252
- for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
253
- win_rate = wins / (len(values) - 1) # normalize to [0, 1]
254
- win_rates_per_row[j].append(win_rate)
252
+ for row_index, row in enumerate(table.rows):
253
+ value = row[column_index].value
254
+ if value is not None:
255
+ win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))
255
256
 
256
257
  # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
257
258
  # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
@@ -266,7 +267,47 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
266
267
  return aggregate_win_rates
267
268
 
268
269
 
269
- AGGREGATE_WIN_RATE_COLUMN = 1
270
+ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
271
+ """
272
+ Computes the aggregate mean of each row across columns.
273
+ Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
274
+ non-null values of the row are in columns we skip).
275
+ """
276
+
277
+ row_means: List[Optional[float]] = []
278
+ # if the first column contains the names of models, do not treat it like a value column
279
+ skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
280
+
281
+ # check for all header cells where specified, that lower_is_better is consistent
282
+ orderings = []
283
+ header_cells = table.header[1:] if skip_first_column else table.header
284
+ for header_cell in header_cells:
285
+ orderings.append(header_cell.lower_is_better)
286
+ if len(set(orderings)) != 1:
287
+ raise Exception("Cannot mean columns with different values for lower_is_better")
288
+
289
+ for row in table.rows:
290
+ total = 0.0
291
+ count = 0
292
+ row_cells = row[1:] if skip_first_column else row
293
+ for cell in row_cells:
294
+ if cell.value is not None:
295
+ total += float(cell.value)
296
+ count += 1
297
+ if count == 0:
298
+ row_means.append(None)
299
+ else:
300
+ row_means.append(total / count)
301
+
302
+ return row_means
303
+
304
+
305
+ class AggregationStrategy:
306
+ WIN_RATE = "win_rate"
307
+ MEAN = "mean"
308
+
309
+
310
+ ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]
270
311
 
271
312
 
272
313
  class Summarizer:
@@ -309,7 +350,7 @@ class Summarizer:
309
350
  release: Optional[str],
310
351
  suites: Optional[List[str]],
311
352
  suite: Optional[str],
312
- schema_file: str,
353
+ schema_path: Optional[str],
313
354
  output_path: str,
314
355
  verbose: bool,
315
356
  num_threads: int,
@@ -329,7 +370,7 @@ class Summarizer:
329
370
  self.suites: List[str]
330
371
  self.run_suite_paths: List[str]
331
372
  self.suite: Optional[str] = None
332
- self.schema_file = schema_file
373
+ self.schema_path = schema_path
333
374
  self.release: Optional[str] = None
334
375
  if suite:
335
376
  self.suite = suite
@@ -344,12 +385,8 @@ class Summarizer:
344
385
  self.verbose: bool = verbose
345
386
  self.num_threads: int = num_threads
346
387
  self.allow_unknown_models: bool = allow_unknown_models
347
-
348
- ensure_directory_exists(self.run_release_path)
349
-
350
- self.schema = read_schema(schema_file)
351
- self.contamination = read_contamination()
352
- validate_contamination(self.contamination, self.schema)
388
+ self.schema = read_schema(schema_path) if schema_path else Schema()
389
+ self.metric_metadata: List[MetricMetadata] = []
353
390
 
354
391
  def read_run(self, run_path: str) -> Run:
355
392
  """Load the `Run` object from `run_path`."""
@@ -375,9 +412,9 @@ class Summarizer:
375
412
  included = False
376
413
  for run_group_name in run.run_spec.groups: # go through the groups of the run to determine visibility
377
414
  if run_group_name not in self.schema.name_to_run_group:
378
- hlog(
379
- f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
380
- f"but undefined in {self.schema_file}, skipping"
415
+ hwarn(
416
+ f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
417
+ f"but undefined in {self.schema_path}, skipping"
381
418
  )
382
419
  continue
383
420
  run_group = self.schema.name_to_run_group[run_group_name]
@@ -396,6 +433,8 @@ class Summarizer:
396
433
 
397
434
  def read_runs_for_suite(self, suite, run_suite_path):
398
435
  """Load the runs in the run suite path."""
436
+ if not os.path.exists(run_suite_path):
437
+ raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
399
438
  # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
400
439
  # so filter them out.
401
440
  run_dir_names = sorted(
@@ -409,14 +448,14 @@ class Summarizer:
409
448
  run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
410
449
  stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
411
450
  if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
412
- hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
451
+ hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
413
452
  continue
414
453
  run_path: str = os.path.join(run_suite_path, run_dir_name)
415
454
  run = self.read_run(run_path)
416
455
  self.runs.append(run)
417
456
  if run.run_spec.name in self.runs_to_run_suites:
418
- hlog(
419
- f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
457
+ hwarn(
458
+ f"Run entry {run.run_spec.name} is present in two different Run Suites. "
420
459
  f"Defaulting to the latest assigned suite: {suite}"
421
460
  )
422
461
  self.runs_to_run_suites[run.run_spec.name] = suite
@@ -433,11 +472,205 @@ class Summarizer:
433
472
  self.group_adapter_to_runs[group_name][adapter_spec].append(run)
434
473
  self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
435
474
 
436
- def write_schema(self):
475
+ @dataclass(frozen=True)
476
+ class _ModelField(Field):
477
+ """The frontend version of ModelMetadata.
478
+
479
+ The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
480
+
481
+ All attributes have the same meaning as in ModelMetadata."""
482
+
483
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
484
+ creator_organization: Optional[str] = None
485
+ access: Optional[str] = None
486
+ todo: bool = False
487
+ release_date: Optional[str] = None
488
+ num_parameters: Optional[int] = None
489
+
490
+ def get_model_field_dicts(self) -> List[Dict]:
491
+ """Get a list of `ModelField`s dicts that will be written to schema.json.
492
+
493
+ The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
494
+
495
+ This is populated by reading the `ModelMetadata` configs and filtering down to models that were
496
+ actually used, and converting each `ModelMetadata` to a `ModelField`."""
497
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
498
+ used_model_names: Set[str] = set()
499
+ for run in self.runs:
500
+ used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
501
+
502
+ model_field_dicts: List[Dict] = []
503
+ for model_name in get_all_models():
504
+ if model_name not in used_model_names:
505
+ continue
506
+ model_metadata = get_model_metadata(model_name)
507
+ model_field = Summarizer._ModelField(
508
+ name=model_metadata.name,
509
+ display_name=model_metadata.display_name,
510
+ short_display_name=model_metadata.display_name,
511
+ description=model_metadata.description,
512
+ creator_organization=model_metadata.creator_organization_name,
513
+ access=model_metadata.access,
514
+ todo=False,
515
+ release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
516
+ num_parameters=model_metadata.num_parameters,
517
+ )
518
+ model_field_dicts.append(asdict_without_nones(model_field))
519
+ return model_field_dicts
520
+
521
+ def get_metric_metadata(self) -> List[MetricMetadata]:
522
+ if self.metric_metadata:
523
+ return self.metric_metadata
524
+ metric_specs: List[MetricSpec] = []
525
+ for run in self.runs:
526
+ metric_specs.extend(run.run_spec.metric_specs)
527
+ metric_specs = list(set(metric_specs))
528
+ metric_name_to_metadata: Dict[str, MetricMetadata] = {}
529
+ for metric_spec in metric_specs:
530
+ try:
531
+ metric: MetricInterface = create_metric(metric_spec)
532
+ metric_metadata_list = metric.get_metadata()
533
+ for metric_metadata in metric_metadata_list:
534
+ metric_name_to_metadata[metric_metadata.name] = metric_metadata
535
+ except NotImplementedError:
536
+ pass
537
+ except (ModuleNotFoundError, AttributeError, TypeError):
538
+ pass
539
+
540
+ run_stat_names: Set[str] = set()
541
+ for run in self.runs:
542
+ for stat in run.stats:
543
+ run_stat_names.add(stat.name.name)
544
+
545
+ metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
546
+ for metric_name_to_prune in metric_names_to_prune:
547
+ del metric_name_to_metadata[metric_name_to_prune]
548
+ self.metric_metadata = list(metric_name_to_metadata.values())
549
+ return self.metric_metadata
550
+
551
+ def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
552
+ return Field(
553
+ name=metric_metadata.name,
554
+ display_name=metric_metadata.display_name,
555
+ short_display_name=metric_metadata.short_display_name,
556
+ description=metric_metadata.description,
557
+ lower_is_better=metric_metadata.lower_is_better,
558
+ )
559
+
560
+ def auto_generate_metric_fields(self) -> List[Field]:
561
+ return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
562
+
563
+ def auto_generate_metric_groups(self) -> List[MetricGroup]:
564
+ metric_groups = [
565
+ MetricGroup(
566
+ name="main_metric",
567
+ display_name="Main Metric",
568
+ description="Main Metric",
569
+ metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
570
+ )
571
+ ]
572
+ metric_group_to_metrics: Dict[str, List[str]] = {}
573
+ for metric_metadata in self.metric_metadata:
574
+ if metric_metadata.group:
575
+ if metric_metadata.group not in metric_group_to_metrics:
576
+ metric_group_to_metrics[metric_metadata.group] = []
577
+ metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
578
+ for metric_group, metric_names in metric_group_to_metrics.items():
579
+ display_name = metric_group.replace("_", " ").capitalize()
580
+ metric_groups.append(
581
+ MetricGroup(
582
+ name=metric_group,
583
+ # TODO: Make display_name and description nicer
584
+ display_name=display_name,
585
+ description=display_name,
586
+ aggregation_strategies=[],
587
+ metrics=[
588
+ MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
589
+ ],
590
+ )
591
+ )
592
+ return metric_groups
593
+
594
+ def get_scenario_metadata(self) -> List[ScenarioMetadata]:
595
+ scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
596
+ scenario_specs = list(set(scenario_specs))
597
+ scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
598
+ for scenario_spec in scenario_specs:
599
+ try:
600
+ scenario: Scenario = create_scenario(scenario_spec)
601
+ scenario_metadata = scenario.get_metadata()
602
+ scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
603
+ except NotImplementedError:
604
+ pass
605
+ except (ModuleNotFoundError, AttributeError, TypeError):
606
+ pass
607
+
608
+ run_groups: Set[str] = set()
609
+ for run in self.runs:
610
+ for run_group in run.run_spec.groups:
611
+ run_groups.add(run_group)
612
+
613
+ scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
614
+ for scenario_name_to_prune in scenario_names_to_prune:
615
+ del scenario_name_to_metadata[scenario_name_to_prune]
616
+ return list(scenario_name_to_metadata.values())
617
+
618
+ def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
619
+ metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
620
+ return RunGroup(
621
+ name=scenario_metadata.name,
622
+ display_name=scenario_metadata.display_name,
623
+ short_display_name=scenario_metadata.short_display_name,
624
+ description=scenario_metadata.description,
625
+ metric_groups=metric_group_names,
626
+ environment={
627
+ "main_name": scenario_metadata.main_metric,
628
+ "main_split": scenario_metadata.main_split,
629
+ },
630
+ taxonomy=scenario_metadata.taxonomy,
631
+ )
632
+
633
+ def auto_generate_all_scenarios_run_group(self) -> RunGroup:
634
+ return RunGroup(
635
+ name="all_scenarios",
636
+ display_name="All Scenarios",
637
+ description="All scenarios",
638
+ category="Scenario Groups",
639
+ subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
640
+ )
641
+
642
+ def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
643
+ return [
644
+ self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
645
+ ]
646
+
647
+ def fix_up_schema(self) -> None:
648
+ # if not self.schema.run_groups:
649
+ if not self.schema.metrics:
650
+ self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
651
+ # Can only auto-generate metric groups if metrics were also auto-generated
652
+ # because auto_generate_metric_groups() requires self.metric_metadata()
653
+ # which is populated by auto_generate_metric_fields()
654
+ if not self.schema.metric_groups:
655
+ self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
656
+ if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
657
+ self.schema = dataclasses.replace(
658
+ self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
659
+ )
660
+ if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
661
+ self.schema = dataclasses.replace(
662
+ self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
663
+ )
664
+
665
+ def write_schema(self) -> None:
437
666
  """Write the schema file to benchmark_output so the frontend knows about it."""
667
+ # Manually add the model metadata to the schema.json, where the frontend expects it.
668
+ # TODO: Move model metadata out of schema.json into its own model_metadata.json file.
669
+ raw_schema = asdict_without_nones(self.schema)
670
+ raw_schema["models"] = self.get_model_field_dicts()
438
671
  write(
439
672
  os.path.join(self.run_release_path, "schema.json"),
440
- json.dumps(asdict_without_nones(self.schema), indent=2, default=serialize_dates),
673
+ json.dumps(raw_schema, indent=2),
441
674
  )
442
675
 
443
676
  def read_runs(self):
@@ -450,136 +683,6 @@ class Summarizer:
450
683
  for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
451
684
  self.read_runs_for_suite(suite, run_suite_path)
452
685
 
453
- def read_overlap_stats(self):
454
- """
455
- Load the overlap stats in the run suite path.
456
- Concretely:
457
- - get group -> scenario_spec information from self.runs
458
- run_spec data
459
- - read the files in the data_overlap directory in run_suite_path
460
- which are scenario_spec -> overlap ids
461
- - get aggregate stats for group -> overlap ratio
462
- """
463
-
464
- def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
465
- scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
466
- for run_spec in run_specs:
467
- scenario_spec = run_spec.scenario_spec
468
- groups = run_spec.groups
469
- if (
470
- scenario_spec.class_name
471
- != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
472
- ):
473
- scenario_specs_to_groups[scenario_spec] = groups
474
-
475
- group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
476
- for scenario_spec, groups in scenario_specs_to_groups.items():
477
- for group in groups:
478
- if group not in group_to_scenario_specs:
479
- group_to_scenario_specs[group] = []
480
- group_to_scenario_specs[group].append(scenario_spec)
481
- return group_to_scenario_specs
482
-
483
- def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
484
- """
485
- Takes the data_overlap_dir as input and returns a dictionary
486
- of stats_file_path -> List(model_names)
487
-
488
- Sample input:
489
- file_models_mapping:
490
- - file_name: file1
491
- model_names:
492
- - model1
493
- - model2
494
- - file_name: file2
495
- model_names:
496
- - model2
497
- - model3
498
-
499
- """
500
- metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
501
- if not os.path.exists(metadata_file_path):
502
- return {}
503
-
504
- with open(metadata_file_path, "r") as yaml_file:
505
- data = yaml.safe_load(yaml_file)
506
-
507
- file_metadata: Dict[str, List[str]] = {}
508
- for entry in data["file_models_mapping"]:
509
- if "file_name" in entry and "model_names" in entry:
510
- file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
511
- file_metadata[file_path] = entry["model_names"]
512
-
513
- return file_metadata
514
-
515
- self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
516
-
517
- data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
518
- if not os.path.isdir(data_overlap_dir):
519
- hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
520
- return
521
-
522
- group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
523
-
524
- stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
525
-
526
- for file_path, model_names in stats_file_metadata.items():
527
- overlap_stats_jsons = open(file_path, "r").readlines()
528
-
529
- data_overlap_stats_list: List[DataOverlapStats] = []
530
- for overlap_stats_json in overlap_stats_jsons:
531
- overlap_stats_dict = json.loads(overlap_stats_json)
532
- data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
533
-
534
- scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
535
- for data_overlap_stats in data_overlap_stats_list:
536
- data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
537
- n = data_overlap_stats_key.overlap_protocol_spec.n
538
- if n == OVERLAP_N_COUNT:
539
- light_scenario_key = data_overlap_stats_key.light_scenario_key
540
- scenario_spec = light_scenario_key.scenario_spec
541
- if scenario_spec in self.scenario_spec_instance_id_dict:
542
- # Get statistics based on the subset of instance_ids that HELM uses for a scenario
543
- instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
544
- num_instances = len(instance_ids)
545
- num_overlapping_inputs = len(
546
- set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
547
- )
548
- num_overlapping_references = len(
549
- set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
550
- )
551
- scenario_spec_overlap_counts[scenario_spec] = (
552
- num_instances,
553
- num_overlapping_inputs,
554
- num_overlapping_references,
555
- )
556
-
557
- for group, scenario_specs in group_to_scenario_specs.items():
558
- group_num_instances = 0
559
- group_num_overlapping_inputs = 0
560
- group_num_overlapping_references = 0
561
- for scenario_spec in scenario_specs:
562
- if scenario_spec in scenario_spec_overlap_counts:
563
- (
564
- num_instances,
565
- num_overlapping_inputs,
566
- num_overlapping_references,
567
- ) = scenario_spec_overlap_counts[scenario_spec]
568
- group_num_instances += num_instances
569
- group_num_overlapping_inputs += num_overlapping_inputs
570
- group_num_overlapping_references += num_overlapping_references
571
- if group_num_instances != 0:
572
- group_overlap_stats = GroupOverlapStats(
573
- group=group,
574
- num_instances=group_num_instances,
575
- num_overlapping_inputs=group_num_overlapping_inputs,
576
- num_overlapping_references=group_num_overlapping_references,
577
- )
578
- for model_name in model_names:
579
- # Assume model name will only be associated with single group overlap list for now
580
- # can update to join lists if need arises
581
- self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats
582
-
583
686
  @htrack(None)
584
687
  def check_metrics_defined(self):
585
688
  """Check that all the metrics that appear in stats are defined."""
@@ -593,8 +696,8 @@ class Summarizer:
593
696
 
594
697
  for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
595
698
  if metric_name not in defined_metric_names:
596
- hlog(
597
- f"WARNING: metric name {metric_name} undefined in {self.schema_file} "
699
+ hwarn(
700
+ f"metric name {metric_name} undefined in {self.schema_path} "
598
701
  f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
599
702
  )
600
703
 
@@ -726,9 +829,6 @@ class Summarizer:
726
829
  num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
727
830
  num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
728
831
 
729
- if len(num_instances) == 0:
730
- continue
731
-
732
832
  rows.append(
733
833
  [
734
834
  Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
@@ -762,9 +862,9 @@ class Summarizer:
762
862
  self,
763
863
  runs: List[Run],
764
864
  matcher: MetricNameMatcher,
765
- contamination_level: Optional[str],
766
865
  additional_info: Optional[str],
767
866
  hide_value: bool = False,
867
+ is_scenario_table: bool = False,
768
868
  ) -> Cell:
769
869
  """
770
870
  Use the metric name identified by `matcher` to pull out the stats from
@@ -790,8 +890,8 @@ class Summarizer:
790
890
  if stat is None:
791
891
  # Print out near misses to provide a more informative warning
792
892
  near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
793
- hlog(
794
- f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
893
+ hwarn(
894
+ f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
795
895
  f"{len(near_misses)} near misses matching just the name"
796
896
  )
797
897
  if len(near_misses) > 0:
@@ -818,23 +918,38 @@ class Summarizer:
818
918
  if self.verbose:
819
919
  description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
820
920
 
821
- style: Dict[str, Any] = {}
822
- if contamination_level is not None:
823
- style = CONTAMINATION_STYLES.get(contamination_level, style)
824
-
825
- return Cell(value=value, description=description, style=style, contamination_level=contamination_level)
921
+ # Link the runs that this cell was aggregated from, if this is not a scenario table.
922
+ # Scenario tables link to the runs in the model cells,
923
+ # whereas non-scenario tables link to the runs in the metrics cells.
924
+ run_spec_names: Optional[List] = None
925
+ if not is_scenario_table:
926
+ # Deduplicate run spec names becuase aggregated_run_specs may have duplicated
927
+ # run specs if a run spec belongs to multiple groups.
928
+ run_spec_names = []
929
+ run_spec_names_set = set()
930
+ for run_spec_name in aggregated_run_specs:
931
+ if run_spec_name not in run_spec_names_set:
932
+ run_spec_names.append(run_spec_name)
933
+ run_spec_names_set.add(run_spec_name)
934
+
935
+ return Cell(
936
+ value=value,
937
+ description=description,
938
+ style={},
939
+ run_spec_names=run_spec_names,
940
+ )
826
941
 
827
942
  def create_group_table(
828
943
  self,
829
944
  name: str,
830
945
  title: str,
831
946
  adapter_to_runs: Dict[AdapterSpec, List[Run]],
832
- link_to_runs: bool,
947
+ is_scenario_table: bool,
833
948
  columns: List[Tuple[RunGroup, str]], # run_group, metric_group
834
949
  sort_by_model_order: bool = True,
835
950
  sub_split: Optional[str] = None,
836
951
  bold_columns: bool = True,
837
- add_win_rate: bool = False,
952
+ aggregation_strategies: List[str] = [],
838
953
  ) -> Table:
839
954
  """
840
955
  Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -847,7 +962,7 @@ class Summarizer:
847
962
  # Create header (cells to display) and the list of metric name filters
848
963
  # (to pull out information later).
849
964
  if not columns or not adapter_to_runs:
850
- hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
965
+ hwarn(f"table {title}, has no rows or columns, leaving empty")
851
966
  return Table("empty", [], [])
852
967
 
853
968
  header: List[HeaderCell] = []
@@ -856,7 +971,7 @@ class Summarizer:
856
971
  num_groups = len(set(run_group.name for run_group, _ in columns)) # number of unique groups, determines headers
857
972
 
858
973
  # Column headers
859
- header.append(HeaderCell("Model/adapter"))
974
+ header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
860
975
  for run_group, metric_group_name in columns:
861
976
  # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
862
977
  if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -868,7 +983,7 @@ class Summarizer:
868
983
  matcher = replace(matcher, sub_split=sub_split)
869
984
  header_field = self.schema.name_to_metric.get(matcher.name)
870
985
  if header_field is None:
871
- hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_file}, skipping")
986
+ hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
872
987
  continue
873
988
  metadata = {
874
989
  "metric": header_field.get_short_display_name(),
@@ -876,7 +991,8 @@ class Summarizer:
876
991
  }
877
992
 
878
993
  header_name = header_field.get_short_display_name()
879
- description = (run_group.description + "\n\n" if run_group.description is not None else "") + (
994
+ run_group_short_description = run_group.short_description or run_group.description or ""
995
+ description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
880
996
  (header_field.display_name if header_field.display_name else header_field.name)
881
997
  + ": "
882
998
  + (header_field.description if header_field.description is not None else "")
@@ -921,10 +1037,10 @@ class Summarizer:
921
1037
 
922
1038
  adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
923
1039
  if sort_by_model_order:
924
- # Sort models by the order defined in the schema.
925
- # Models not defined in the schema will be sorted alphabetically and
926
- # placed before models in defined the schema.
927
- model_order = [model.name for model in self.schema.models]
1040
+ # Sort models by the order defined in the the model metadata config.
1041
+ # Models not defined in the model metadata config will be sorted alphabetically and
1042
+ # placed before models in defined the model metadata config.
1043
+ model_order = get_all_models()
928
1044
 
929
1045
  def _adapter_spec_sort_key(spec):
930
1046
  index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
@@ -948,21 +1064,18 @@ class Summarizer:
948
1064
  runs = adapter_to_runs[adapter_spec]
949
1065
  display_name = get_method_display_name(model_metadata.display_name, info)
950
1066
 
951
- # Link to all the runs under this model
952
- if link_to_runs:
1067
+ # Link the runs that this row was aggregated from, if this is a scenario table.
1068
+ # Scenario tables link to the runs in the model cells,
1069
+ # whereas non-scenario tables link to the runs in the metrics cells.
1070
+ run_spec_names: Optional[List[str]]
1071
+ if is_scenario_table:
953
1072
  run_spec_names = [run.run_spec.name for run in runs]
954
1073
  href = run_spec_names_to_url(run_spec_names)
955
1074
  else:
1075
+ run_spec_names = None
956
1076
  href = None
957
1077
 
958
- # Render contamination information
959
- point = self.contamination.get_point(model_name, columns[0][0].name)
960
- if num_groups == 1 and point is not None: # display contamination information at the adapter level
961
- cells = [
962
- Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
963
- ]
964
- else:
965
- cells = [Cell(display_name, description="", href=href)]
1078
+ cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
966
1079
  assert len(group_names) == len(matchers)
967
1080
  for group_name, matcher in zip(group_names, matchers):
968
1081
  group_runs = [run for run in runs if group_name in run.run_spec.groups]
@@ -971,23 +1084,7 @@ class Summarizer:
971
1084
  if "babi" in group_name and "task:" not in name:
972
1085
  group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]
973
1086
 
974
- point = self.contamination.get_point(model_name, group_name)
975
- if point is not None:
976
- description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
977
- contamination_level = point.level
978
- else:
979
- description = ""
980
- contamination_level = None
981
-
982
- group_overlap_stats = None
983
- if (model_name, group_name) in self._model_group_overlap_stats:
984
- group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
985
-
986
- description = (
987
- f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
988
- f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
989
- f"{description}"
990
- )
1087
+ description = ""
991
1088
 
992
1089
  # HACK: we want to hide stats for the following model-metric combinations:
993
1090
  # 1. Calibration metrics + AI21/Anthropic
@@ -999,9 +1096,9 @@ class Summarizer:
999
1096
  self.create_cell(
1000
1097
  group_runs,
1001
1098
  matcher,
1002
- contamination_level,
1003
1099
  additional_info=description,
1004
1100
  hide_value=hide_value,
1101
+ is_scenario_table=is_scenario_table,
1005
1102
  )
1006
1103
  )
1007
1104
 
@@ -1011,12 +1108,12 @@ class Summarizer:
1011
1108
  # There could be a ton of runs, so only do this if there are 2-5
1012
1109
  # TODO: replace in frontend with a selector to choose which rows to visualize.
1013
1110
  links = []
1014
- if link_to_runs:
1111
+ if is_scenario_table:
1015
1112
  all_run_spec_names = []
1016
1113
  for adapter_spec, runs in adapter_to_runs.items():
1017
1114
  if len(runs) > 1:
1018
- hlog(
1019
- f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
1115
+ hwarn(
1116
+ f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
1020
1117
  f" {[run.run_spec.name for run in runs]}"
1021
1118
  )
1022
1119
  for run in runs:
@@ -1026,21 +1123,42 @@ class Summarizer:
1026
1123
 
1027
1124
  table = Table(title=title, header=header, rows=rows, links=links, name=name)
1028
1125
 
1029
- if add_win_rate:
1030
- # add overall win rate as the second column
1031
- WIN_RATE_AGGREGATION = "mean"
1032
- win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
1033
- description = "How many models this model outperform on average (over columns)."
1034
- table.header.insert(
1035
- AGGREGATE_WIN_RATE_COLUMN,
1036
- HeaderCell(
1037
- f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
1038
- description=description,
1039
- lower_is_better=False,
1040
- ),
1041
- )
1042
- for row, win_rate in zip(table.rows, win_rates):
1043
- row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
1126
+ aggregate_header_cells: List[HeaderCell] = []
1127
+ aggregate_row_values: List[List[Optional[float]]] = []
1128
+
1129
+ for strategy in aggregation_strategies:
1130
+ if strategy == AggregationStrategy.WIN_RATE:
1131
+ WIN_RATE_AGGREGATION = "mean"
1132
+ win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
1133
+ aggregate_header_cells.append(
1134
+ HeaderCell(
1135
+ f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
1136
+ description="How many models this model outperforms on average (over columns).",
1137
+ lower_is_better=False,
1138
+ )
1139
+ )
1140
+ aggregate_row_values.append(win_rates)
1141
+ elif strategy == AggregationStrategy.MEAN:
1142
+ means = compute_aggregate_row_means(table)
1143
+ aggregate_header_cells.append(
1144
+ HeaderCell(
1145
+ "Mean score",
1146
+ description="The mean of the scores from all columns.",
1147
+ lower_is_better=table.header[0].lower_is_better,
1148
+ )
1149
+ )
1150
+ aggregate_row_values.append(means)
1151
+ else:
1152
+ raise Exception(
1153
+ f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
1154
+ )
1155
+
1156
+ for i in range(len(aggregate_header_cells)):
1157
+ aggregate_header_cell = aggregate_header_cells[i]
1158
+ aggregate_rows = aggregate_row_values[i]
1159
+ table.header.insert(i + 1, aggregate_header_cell)
1160
+ for row, row_val in zip(table.rows, aggregate_rows):
1161
+ row.insert(i + 1, Cell(row_val))
1044
1162
 
1045
1163
  if bold_columns:
1046
1164
  for i, header_cell in enumerate(table.header):
@@ -1088,16 +1206,25 @@ class Summarizer:
1088
1206
 
1089
1207
  if len(adapter_to_runs) > 0:
1090
1208
  for metric_group in all_metric_groups:
1091
- display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
1209
+ metric_group_config = self.schema.name_to_metric_group[metric_group]
1210
+ display_name = metric_group_config.get_short_display_name()
1211
+ aggregate_strategies: List[str]
1212
+ if metric_group_config.aggregation_strategies is not None:
1213
+ aggregate_strategies = metric_group_config.aggregation_strategies
1214
+ elif metric_group_config.hide_win_rates:
1215
+ aggregate_strategies = []
1216
+ else:
1217
+ aggregate_strategies = [AggregationStrategy.WIN_RATE]
1092
1218
  table = self.create_group_table(
1093
1219
  name=metric_group,
1094
1220
  title=display_name,
1095
1221
  adapter_to_runs=adapter_to_runs,
1096
1222
  columns=[(subgroup, metric_group) for subgroup in subgroups],
1097
- link_to_runs=False,
1098
- add_win_rate=True,
1223
+ is_scenario_table=False,
1224
+ aggregation_strategies=aggregate_strategies,
1099
1225
  )
1100
- tables.append(table)
1226
+ if len(table.header) > 1:
1227
+ tables.append(table)
1101
1228
  return tables
1102
1229
 
1103
1230
  def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1126,7 +1253,7 @@ class Summarizer:
1126
1253
  name=scenario_name,
1127
1254
  adapter_to_runs=adapter_to_runs,
1128
1255
  columns=columns,
1129
- link_to_runs=True,
1256
+ is_scenario_table=True,
1130
1257
  )
1131
1258
  tables.append(table)
1132
1259
  scenarios_shown += 1
@@ -1138,7 +1265,7 @@ class Summarizer:
1138
1265
  name=f"{subgroup.name}:sub_split={sub_split}",
1139
1266
  adapter_to_runs=adapter_to_runs,
1140
1267
  columns=columns,
1141
- link_to_runs=False,
1268
+ is_scenario_table=False,
1142
1269
  sub_split=sub_split,
1143
1270
  )
1144
1271
  tables.append(table)
@@ -1158,7 +1285,7 @@ class Summarizer:
1158
1285
  name=subgroup.name,
1159
1286
  adapter_to_runs=adapter_to_runs,
1160
1287
  columns=columns,
1161
- link_to_runs=False,
1288
+ is_scenario_table=False,
1162
1289
  )
1163
1290
  tables = [table] + tables
1164
1291
  all_tables.extend(tables)
@@ -1225,72 +1352,6 @@ class Summarizer:
1225
1352
 
1226
1353
  parallel_map(process, self.runs, parallelism=self.num_threads)
1227
1354
 
1228
- def read_scenario_spec_instance_ids(self, num_instances) -> None:
1229
- """
1230
- This file checks if there exists a file, scenario_spec_instance_ids.json
1231
- that it can read the instance_ids associated with scenario_specs.
1232
-
1233
- It will write the num_instances used in the run as part of the file name
1234
-
1235
- If it doesn't exist, it will go through all the scenario_state files
1236
- and parse the instance_ids and output it to the file for future uses
1237
-
1238
- Only when the scenario_specs for the data overlap script change
1239
- (or num_instances are different), will this need to be rerun.
1240
-
1241
- In such cases, do not include the file as part of the data_overlap directory.
1242
- """
1243
- self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
1244
-
1245
- data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
1246
- if not os.path.isdir(data_overlap_dir):
1247
- hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
1248
- return
1249
-
1250
- scenario_spec_instance_ids_json = os.path.join(
1251
- data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
1252
- )
1253
- if not os.path.exists(scenario_spec_instance_ids_json):
1254
- hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
1255
- self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
1256
- else:
1257
- hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
1258
- scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
1259
-
1260
- for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
1261
- scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
1262
- scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
1263
- self.scenario_spec_instance_id_dict[
1264
- scenario_spec_instance_ids.scenario_spec
1265
- ] = scenario_spec_instance_ids.instance_ids
1266
-
1267
- def write_scenario_spec_instance_ids_json(self, file_path) -> None:
1268
- for run in self.runs:
1269
- run_spec = run.run_spec
1270
- scenario_spec = run_spec.scenario_spec
1271
- if scenario_spec in self.scenario_spec_instance_id_dict:
1272
- continue
1273
-
1274
- run_path = run.run_path
1275
- instances_file_path = os.path.join(run_path, "instances.json")
1276
- with open(instances_file_path, "r") as f:
1277
- raw_instances = json.load(f)
1278
-
1279
- # Optimization: Don't structure to dataclass, since we only need to read `id`
1280
- instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
1281
- self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
1282
-
1283
- all_scenario_spec_instance_ids = []
1284
- for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
1285
- scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
1286
- all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
1287
-
1288
- with open(file_path, "w") as f:
1289
- f.writelines(
1290
- f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
1291
- for scenario_spec_instance_ids in all_scenario_spec_instance_ids
1292
- )
1293
-
1294
1355
  def symlink_latest(self) -> None:
1295
1356
  # Create a symlink runs/latest -> runs/<name_of_suite>,
1296
1357
  # so runs/latest always points to the latest run suite.
@@ -1302,24 +1363,20 @@ class Summarizer:
1302
1363
  os.unlink(symlink_path)
1303
1364
  os.symlink(os.path.basename(self.run_release_path), symlink_path)
1304
1365
 
1305
- def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
1366
+ def run_pipeline(self, skip_completed: bool) -> None:
1306
1367
  """Run the entire summarization pipeline."""
1307
- self.write_schema()
1308
-
1309
1368
  self.read_runs()
1310
1369
  self.group_runs()
1311
- self.check_metrics_defined()
1312
1370
 
1313
- self.write_run_display_json(skip_completed)
1314
-
1315
- # Must happen after summarizer.write_run_display_json()
1316
- # because it uses instances.json files
1317
- self.read_scenario_spec_instance_ids(num_instances)
1371
+ ensure_directory_exists(self.run_release_path)
1318
1372
 
1319
- # Must happen after summarizer.read_scenario_spec_instance_ids()
1320
- # because it uses self.scenario_spec_instance_id_dict
1321
- self.read_overlap_stats()
1373
+ # Must happen after self.read_runs()
1374
+ # because it uses self.runs
1375
+ self.fix_up_schema()
1376
+ self.check_metrics_defined()
1377
+ self.write_schema()
1322
1378
 
1379
+ self.write_run_display_json(skip_completed)
1323
1380
  self.write_executive_summary()
1324
1381
  self.write_runs()
1325
1382
  self.write_run_specs()
@@ -1331,16 +1388,70 @@ class Summarizer:
1331
1388
 
1332
1389
 
1333
1390
  @htrack("summarize")
1391
+ def summarize(args):
1392
+ release: Optional[str] = None
1393
+ suites: Optional[str] = None
1394
+ suite: Optional[str] = None
1395
+ if args.suite and (args.release or args.suites):
1396
+ raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
1397
+ elif args.suite:
1398
+ # Comment this out while we have a trial period for the `release` method.
1399
+ # hlog(
1400
+ # "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
1401
+ # "where --release specifies the name of a release and --suites specifies several run suites "
1402
+ # "to be included in that release."
1403
+ # )
1404
+ suite = args.suite
1405
+ elif args.release or args.suites:
1406
+ if not args.release or not args.suites:
1407
+ raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
1408
+ release = args.release
1409
+ suites = args.suites
1410
+ else:
1411
+ raise ValueError("Exactly one of --release or --suite must be specified.")
1412
+
1413
+ schema_path: Optional[str]
1414
+ if args.auto_generate_schema:
1415
+ if args.schema_path:
1416
+ raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
1417
+ schema_path = None
1418
+ elif args.schema_path:
1419
+ schema_path = args.schema_path
1420
+ else:
1421
+ schema_path = get_default_schema_path()
1422
+
1423
+ register_builtin_configs_from_helm_package()
1424
+ register_configs_from_directory(args.local_path)
1425
+
1426
+ # Output JSON files summarizing the benchmark results which will be loaded in the web interface
1427
+ summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
1428
+ summarizer = summarizer_cls(
1429
+ release=release,
1430
+ suites=suites,
1431
+ suite=suite,
1432
+ schema_path=schema_path,
1433
+ output_path=args.output_path,
1434
+ verbose=args.debug,
1435
+ num_threads=args.num_threads,
1436
+ allow_unknown_models=args.allow_unknown_models,
1437
+ )
1438
+ summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
1439
+ hlog("Done.")
1440
+
1441
+
1334
1442
  def main():
1335
1443
  parser = argparse.ArgumentParser()
1336
1444
  parser.add_argument(
1337
- "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
1445
+ "-o",
1446
+ "--output-path",
1447
+ type=str,
1448
+ help="Where the benchmarking output lives",
1449
+ default="benchmark_output",
1338
1450
  )
1339
1451
  parser.add_argument(
1340
- "--schema-file",
1452
+ "--schema-path",
1341
1453
  type=str,
1342
- help="File name of the schema to read (e.g., schema_classic.yaml).",
1343
- default=SCHEMA_CLASSIC_YAML_FILENAME,
1454
+ help="Path to the schema file (e.g., schema_classic.yaml).",
1344
1455
  )
1345
1456
  parser.add_argument(
1346
1457
  "--suite",
@@ -1353,9 +1464,18 @@ def main():
1353
1464
  help="Experimental: Name of the release this summarization should go under.",
1354
1465
  )
1355
1466
  parser.add_argument(
1356
- "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
1467
+ "--suites",
1468
+ type=str,
1469
+ nargs="+",
1470
+ help="Experimental: List of suites to summarize for this this release.",
1471
+ )
1472
+ parser.add_argument(
1473
+ "-n",
1474
+ "--num-threads",
1475
+ type=int,
1476
+ help="Max number of threads used to summarize",
1477
+ default=8,
1357
1478
  )
1358
- parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
1359
1479
  parser.add_argument(
1360
1480
  "--debug",
1361
1481
  action="store_true",
@@ -1366,12 +1486,6 @@ def main():
1366
1486
  action="store_true",
1367
1487
  help="Skip write_run_display_json() for runs which already have all output display JSON files",
1368
1488
  )
1369
- parser.add_argument(
1370
- "-num-instances",
1371
- type=int,
1372
- help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
1373
- default=1000,
1374
- )
1375
1489
  parser.add_argument(
1376
1490
  "--local-path",
1377
1491
  type=str,
@@ -1384,45 +1498,26 @@ def main():
1384
1498
  help="Whether to allow unknown models in the metadata file",
1385
1499
  default=True,
1386
1500
  )
1387
- args = parser.parse_args()
1388
-
1389
- release: Optional[str] = None
1390
- suites: Optional[str] = None
1391
- suite: Optional[str] = None
1392
- if args.suite and (args.release or args.suites):
1393
- raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
1394
- elif args.suite:
1395
- # Comment this out while we have a trial period for the `release` method.
1396
- # hlog(
1397
- # "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
1398
- # "where --release specifies the name of a release and --suites specifies several run suites "
1399
- # "to be included in that release."
1400
- # )
1401
- suite = args.suite
1402
- elif args.release or args.suites:
1403
- if not args.release or not args.suites:
1404
- raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
1405
- release = args.release
1406
- suites = args.suites
1407
- else:
1408
- raise ValueError("Exactly one of --release or --suite must be specified.")
1409
-
1410
- register_builtin_configs_from_helm_package()
1411
- register_configs_from_directory(args.local_path)
1412
-
1413
- # Output JSON files summarizing the benchmark results which will be loaded in the web interface
1414
- summarizer = Summarizer(
1415
- release=release,
1416
- suites=suites,
1417
- suite=suite,
1418
- schema_file=args.schema_file,
1419
- output_path=args.output_path,
1420
- verbose=args.debug,
1421
- num_threads=args.num_threads,
1422
- allow_unknown_models=args.allow_unknown_models,
1501
+ parser.add_argument(
1502
+ "--summarizer-class-name",
1503
+ type=str,
1504
+ default=None,
1505
+ help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
1423
1506
  )
1424
- summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
1425
- hlog("Done.")
1507
+ parser.add_argument(
1508
+ "--log-config",
1509
+ type=str,
1510
+ default=None,
1511
+ help="PATH to a YAML file to customize logging",
1512
+ )
1513
+ parser.add_argument(
1514
+ "--auto-generate-schema",
1515
+ action="store_true",
1516
+ help="EXPERIMENTAL: Auto-generate schema",
1517
+ )
1518
+ args = parser.parse_args()
1519
+ setup_default_logging(args.log_config)
1520
+ summarize(args)
1426
1521
 
1427
1522
 
1428
1523
  if __name__ == "__main__":