crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

Expanded file: helm/benchmark/presentation/summarize.py (+119 -256)

@@ -9,12 +9,10 @@ Usage:
 """
 
 import argparse
-import cattrs
 import os
 import datetime
 import urllib.parse
 import json
-import yaml
 from collections import defaultdict
 from dataclasses import dataclass, replace
 from statistics import mean, median
@@ -35,8 +33,6 @@ from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
-from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric import get_all_stats_by_name
 from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -56,9 +52,10 @@ from helm.benchmark.presentation.schema import (
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
 
 
-
+MODEL_HEADER_CELL_VALUE = "Model"
 
 
 @dataclass(frozen=True)
@@ -226,17 +223,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     """
     assert aggregation in ["mean", "median"]
     win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
-    for
+    for column_index, header_cell in enumerate(table.header):
         lower_is_better = header_cell.lower_is_better
         if lower_is_better is None:  # column does not have a meaningful ordering
             continue
-
-
-
+        value_to_count: Dict[float, int] = defaultdict(int)
+        for row in table.rows:
+            value = row[column_index].value
+            if value is not None:
+                value_to_count[value] += 1
+        value_to_wins: Dict[float, float] = {}
+        acc_count = 0
+        for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
+            value_to_wins[value] = acc_count + ((value_count - 1) / 2)
+            acc_count += value_count
+        total_count = acc_count
+        if total_count < 2:
             continue
-        for
-
-
+        for row_index, row in enumerate(table.rows):
+            value = row[column_index].value
+            if value is not None:
+                win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))
 
     # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
     # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
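
The rewritten loop above makes win rates tie-aware: each distinct value in a column is credited with the number of strictly worse values plus half of the rows it ties with, then normalized by total_count - 1. Below is a minimal standalone sketch of that logic on a single column (plain floats instead of HELM's Table/Cell types; column_win_rates is an illustrative name, not a HELM function):

```python
from collections import defaultdict
from typing import Dict, List


def column_win_rates(values: List[float], lower_is_better: bool = False) -> List[float]:
    """Tie-aware win rates for one column, mirroring the logic in the hunk above."""
    if len(values) < 2:  # mirrors the hunk's total_count < 2 guard
        return []

    value_to_count: Dict[float, int] = defaultdict(int)
    for value in values:
        value_to_count[value] += 1

    # Walk values from worst to best; acc_count is how many rows each value beats,
    # and tied rows split their mutual wins evenly via (count - 1) / 2.
    value_to_wins: Dict[float, float] = {}
    acc_count = 0
    for value, count in sorted(value_to_count.items(), reverse=lower_is_better):
        value_to_wins[value] = acc_count + (count - 1) / 2
        acc_count += count

    return [value_to_wins[value] / (len(values) - 1) for value in values]


# Two rows tied at 0.8 share their win: each earns 1.5 of 3 possible wins.
print(column_win_rates([0.9, 0.8, 0.8, 0.7]))  # [1.0, 0.5, 0.5, 0.0]
```
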
@@ -251,7 +258,48 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     return aggregate_win_rates
 
 
-
+def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
+    """
+    Computes the aggregate mean of each row across columns.
+    Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
+    non-null values of the row are in columns we skip).
+    """
+
+    row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
+
+    # check for all header cells where specified, that lower_is_better is consistent
+    orderings = []
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
+    if len(set(orderings)) != 1:
+        raise Exception("Cannot mean columns with different values for lower_is_better")
+
+    for row in table.rows:
+        total = 0.0
+        count = 0
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
+            if cell.value is not None:
+                total += float(cell.value)
+                count += 1
+        if count == 0:
+            row_means.append(None)
+        else:
+            row_means.append(total / count)
+
+    return row_means
+
+
+class AggregationStrategy:
+    # TODO: Convert to StrEnum after upgrading to Python 3.11
+    WIN_RATE = "win_rate"
+    MEAN = "mean"
+
+
+ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]
 
 
 class Summarizer:
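
The new compute_aggregate_row_means skips the model-name column, ignores empty cells, and raises if the aggregated columns disagree on lower_is_better. A rough per-row sketch of that averaging behavior (plain optional floats instead of HELM's Table; row_mean is a made-up helper, not part of HELM):

```python
from typing import List, Optional


def row_mean(cells: List[Optional[float]]) -> Optional[float]:
    """Average the non-None cells of one row; None if nothing is comparable."""
    numeric = [float(value) for value in cells if value is not None]
    return sum(numeric) / len(numeric) if numeric else None


print(row_mean([0.8, 0.6, None]))  # 0.7 -- None cells are skipped
print(row_mean([None, None]))      # None -- row never comparable
```

Unlike win rates, this mean is only meaningful when all aggregated columns share a direction, which is exactly what the lower_is_better consistency check enforces.
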
@@ -483,137 +531,6 @@ class Summarizer:
         for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
             self.read_runs_for_suite(suite, run_suite_path)
 
-    def read_overlap_stats(self):
-        """
-        Load the overlap stats in the run suite path.
-        Concretely:
-        - get group -> scenario_spec information from self.runs
-          run_spec data
-        - read the files in the data_overlap directory in run_suite_path
-          which are scenario_spec -> overlap ids
-        - get aggregate stats for group -> overlap ratio
-        """
-
-        def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
-            scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
-            for run_spec in run_specs:
-                scenario_spec = run_spec.scenario_spec
-                groups = run_spec.groups
-                if (
-                    scenario_spec.class_name
-                    != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
-                ):
-                    scenario_specs_to_groups[scenario_spec] = groups
-
-            group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
-            for scenario_spec, groups in scenario_specs_to_groups.items():
-                for group in groups:
-                    if group not in group_to_scenario_specs:
-                        group_to_scenario_specs[group] = []
-                    group_to_scenario_specs[group].append(scenario_spec)
-            return group_to_scenario_specs
-
-        def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
-            """
-            Takes the data_overlap_dir as input and returns a dictionary
-            of stats_file_path -> List(model_names)
-
-            Sample input:
-            file_models_mapping:
-              - file_name: file1
-                model_names:
-                  - model1
-                  - model2
-              - file_name: file2
-                model_names:
-                  - model2
-                  - model3
-
-            """
-            metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
-            if not os.path.exists(metadata_file_path):
-                return {}
-
-            with open(metadata_file_path, "r") as yaml_file:
-                data = yaml.safe_load(yaml_file)
-
-            file_metadata: Dict[str, List[str]] = {}
-            for entry in data["file_models_mapping"]:
-                if "file_name" in entry and "model_names" in entry:
-                    file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
-                    file_metadata[file_path] = entry["model_names"]
-
-            return file_metadata
-
-        # TODO: Delete this after @andyzorigin's project is done.
-        self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
-            return
-
-        group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
-
-        stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
-
-        for file_path, model_names in stats_file_metadata.items():
-            overlap_stats_jsons = open(file_path, "r").readlines()
-
-            data_overlap_stats_list: List[DataOverlapStats] = []
-            for overlap_stats_json in overlap_stats_jsons:
-                overlap_stats_dict = json.loads(overlap_stats_json)
-                data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
-
-            scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
-            for data_overlap_stats in data_overlap_stats_list:
-                data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
-                n = data_overlap_stats_key.overlap_protocol_spec.n
-                if n == OVERLAP_N_COUNT:
-                    light_scenario_key = data_overlap_stats_key.light_scenario_key
-                    scenario_spec = light_scenario_key.scenario_spec
-                    if scenario_spec in self.scenario_spec_instance_id_dict:
-                        # Get statistics based on the subset of instance_ids that HELM uses for a scenario
-                        instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
-                        num_instances = len(instance_ids)
-                        num_overlapping_inputs = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
-                        )
-                        num_overlapping_references = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
-                        )
-                        scenario_spec_overlap_counts[scenario_spec] = (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        )
-
-            for group, scenario_specs in group_to_scenario_specs.items():
-                group_num_instances = 0
-                group_num_overlapping_inputs = 0
-                group_num_overlapping_references = 0
-                for scenario_spec in scenario_specs:
-                    if scenario_spec in scenario_spec_overlap_counts:
-                        (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        ) = scenario_spec_overlap_counts[scenario_spec]
-                        group_num_instances += num_instances
-                        group_num_overlapping_inputs += num_overlapping_inputs
-                        group_num_overlapping_references += num_overlapping_references
-                if group_num_instances != 0:
-                    group_overlap_stats = GroupOverlapStats(
-                        group=group,
-                        num_instances=group_num_instances,
-                        num_overlapping_inputs=group_num_overlapping_inputs,
-                        num_overlapping_references=group_num_overlapping_references,
-                    )
-                    for model_name in model_names:
-                        # Assume model name will only be associated with single group overlap list for now
-                        # can update to join lists if need arises
-                        self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats
-
     @htrack(None)
     def check_metrics_defined(self):
         """Check that all the metrics that appear in stats are defined."""
@@ -880,7 +797,7 @@ class Summarizer:
         sort_by_model_order: bool = True,
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
-
+        aggregation_strategies: List[str] = [],
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -902,7 +819,7 @@ class Summarizer:
         num_groups = len(set(run_group.name for run_group, _ in columns))  # number of unique groups, determines headers
 
         # Column headers
-        header.append(HeaderCell(
+        header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
         for run_group, metric_group_name in columns:
             # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
             if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -1016,16 +933,6 @@ class Summarizer:
 
         description = ""
 
-        group_overlap_stats = None
-        if (model_name, group_name) in self._model_group_overlap_stats:
-            group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
-
-            description = (
-                f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
-                f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
-                f"{description}"
-            )
-
         # HACK: we want to hide stats for the following model-metric combinations:
         # 1. Calibration metrics + AI21/Anthropic
         # 2. MSMARCO metrics + AI21/Anthropic
@@ -1063,21 +970,42 @@ class Summarizer:
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        aggregate_header_cells: List[HeaderCell] = []
+        aggregate_row_values: List[List[Optional[float]]] = []
+
+        for strategy in aggregation_strategies:
+            if strategy == AggregationStrategy.WIN_RATE:
+                WIN_RATE_AGGREGATION = "mean"
+                win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
+                        description="How many models this model outperforms on average (over columns).",
+                        lower_is_better=False,
+                    )
+                )
+                aggregate_row_values.append(win_rates)
+            elif strategy == AggregationStrategy.MEAN:
+                means = compute_aggregate_row_means(table)
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        "Mean score",
+                        description="The mean of the scores from all columns.",
+                        lower_is_better=table.header[0].lower_is_better,
+                    )
+                )
+                aggregate_row_values.append(means)
+            else:
+                raise Exception(
+                    f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
+                )
+
+        for i in range(len(aggregate_header_cells)):
+            aggregate_header_cell = aggregate_header_cells[i]
+            aggregate_rows = aggregate_row_values[i]
+            table.header.insert(i + 1, aggregate_header_cell)
+            for row, row_val in zip(table.rows, aggregate_rows):
+                row.insert(i + 1, Cell(row_val))
 
         if bold_columns:
             for i, header_cell in enumerate(table.header):
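
The splice loop at the end of this hunk inserts the aggregate column for strategy i at index i + 1, so aggregates always land immediately after the "Model" column and keep the order of aggregation_strategies. A small sketch with plain lists standing in for HeaderCell/Cell (the column names and numbers are illustrative only):

```python
header = ["Model", "NarrativeQA", "MMLU"]
rows = [["model-a", 0.61, 0.72], ["model-b", 0.55, 0.68]]

# (name, per-row values) pairs, in the order the strategies were requested.
aggregate_columns = [("Mean win rate", [1.0, 0.0]), ("Mean score", [0.665, 0.615])]

for i, (name, values) in enumerate(aggregate_columns):
    header.insert(i + 1, name)  # strategy i lands right after the "Model" column
    for row, value in zip(rows, values):
        row.insert(i + 1, value)

print(header)   # ['Model', 'Mean win rate', 'Mean score', 'NarrativeQA', 'MMLU']
print(rows[0])  # ['model-a', 1.0, 0.665, 0.61, 0.72]
```
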
@@ -1125,14 +1053,22 @@ class Summarizer:
|
|
|
1125
1053
|
|
|
1126
1054
|
if len(adapter_to_runs) > 0:
|
|
1127
1055
|
for metric_group in all_metric_groups:
|
|
1128
|
-
|
|
1056
|
+
metric_group_config = self.schema.name_to_metric_group[metric_group]
|
|
1057
|
+
display_name = metric_group_config.get_short_display_name()
|
|
1058
|
+
aggregate_strategies: List[str]
|
|
1059
|
+
if metric_group_config.aggregation_strategies is not None:
|
|
1060
|
+
aggregate_strategies = metric_group_config.aggregation_strategies
|
|
1061
|
+
elif metric_group_config.hide_win_rates:
|
|
1062
|
+
aggregate_strategies = []
|
|
1063
|
+
else:
|
|
1064
|
+
aggregate_strategies = [AggregationStrategy.WIN_RATE]
|
|
1129
1065
|
table = self.create_group_table(
|
|
1130
1066
|
name=metric_group,
|
|
1131
1067
|
title=display_name,
|
|
1132
1068
|
adapter_to_runs=adapter_to_runs,
|
|
1133
1069
|
columns=[(subgroup, metric_group) for subgroup in subgroups],
|
|
1134
1070
|
is_scenario_table=False,
|
|
1135
|
-
|
|
1071
|
+
aggregation_strategies=aggregate_strategies,
|
|
1136
1072
|
)
|
|
1137
1073
|
tables.append(table)
|
|
1138
1074
|
return tables
|
|
@@ -1262,72 +1198,6 @@ class Summarizer:

         parallel_map(process, self.runs, parallelism=self.num_threads)

-    def read_scenario_spec_instance_ids(self, num_instances) -> None:
-        """
-        This file checks if there exists a file, scenario_spec_instance_ids.json
-        that it can read the instance_ids associated with scenario_specs.
-
-        It will write the num_instances used in the run as part of the file name
-
-        If it doesn't exist, it will go through all the scenario_state files
-        and parse the instance_ids and output it to the file for future uses
-
-        Only when the scenario_specs for the data overlap script change
-        (or num_instances are different), will this need to be rerun.
-
-        In such cases, do not include the file as part of the data_overlap directory.
-        """
-        self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
-            return
-
-        scenario_spec_instance_ids_json = os.path.join(
-            data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
-        )
-        if not os.path.exists(scenario_spec_instance_ids_json):
-            hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
-            self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
-        else:
-            hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
-            scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
-
-            for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
-                scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
-                scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-                self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
-                    scenario_spec_instance_ids.instance_ids
-                )
-
-    def write_scenario_spec_instance_ids_json(self, file_path) -> None:
-        for run in self.runs:
-            run_spec = run.run_spec
-            scenario_spec = run_spec.scenario_spec
-            if scenario_spec in self.scenario_spec_instance_id_dict:
-                continue
-
-            run_path = run.run_path
-            instances_file_path = os.path.join(run_path, "instances.json")
-            with open(instances_file_path, "r") as f:
-                raw_instances = json.load(f)
-
-            # Optimization: Don't structure to dataclass, since we only need to read `id`
-            instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
-            self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
-
-        all_scenario_spec_instance_ids = []
-        for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
-            scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
-            all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
-
-        with open(file_path, "w") as f:
-            f.writelines(
-                f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
-                for scenario_spec_instance_ids in all_scenario_spec_instance_ids
-            )
-
     def symlink_latest(self) -> None:
         # Create a symlink runs/latest -> runs/<name_of_suite>,
         # so runs/latest always points to the latest run suite.
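The two methods removed above cached a mapping from each ScenarioSpec to its instance IDs as one JSON object per line (JSONL) for the data-overlap tooling. For reviewers unfamiliar with the pattern, here is a minimal self-contained sketch of that round-trip; it substitutes plain dataclasses and dataclasses.asdict for HELM's actual ScenarioSpec, ScenarioSpecInstanceIds, cattrs, and asdict_without_nones, so the names and shapes below are illustrative only.

    import json
    from dataclasses import asdict, dataclass
    from typing import Dict, List

    @dataclass(frozen=True)
    class ScenarioSpec:  # stand-in: frozen so instances can be dict keys
        class_name: str

    @dataclass
    class ScenarioSpecInstanceIds:  # stand-in for the removed record type
        scenario_spec: ScenarioSpec
        instance_ids: List[str]

    def write_jsonl(file_path: str, records: List[ScenarioSpecInstanceIds]) -> None:
        # One JSON object per line, as in the removed write_scenario_spec_instance_ids_json.
        with open(file_path, "w") as f:
            f.writelines(f"{json.dumps(asdict(record))}\n" for record in records)

    def read_jsonl(file_path: str) -> Dict[ScenarioSpec, List[str]]:
        # Parse each line and index instance IDs by scenario spec, as in the removed reader.
        result: Dict[ScenarioSpec, List[str]] = {}
        with open(file_path) as f:
            for line in f:
                raw = json.loads(line)
                result[ScenarioSpec(**raw["scenario_spec"])] = raw["instance_ids"]
        return result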
@@ -1339,7 +1209,7 @@ class Summarizer:
             os.unlink(symlink_path)
         os.symlink(os.path.basename(self.run_release_path), symlink_path)

-    def run_pipeline(self, skip_completed: bool
+    def run_pipeline(self, skip_completed: bool) -> None:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
@@ -1347,14 +1217,6 @@ class Summarizer:

         self.write_run_display_json(skip_completed)

-        # Must happen after summarizer.write_run_display_json()
-        # because it uses instances.json files
-        self.read_scenario_spec_instance_ids(num_instances)
-
-        # Must happen after summarizer.read_scenario_spec_instance_ids()
-        # because it uses self.scenario_spec_instance_id_dict
-        self.read_overlap_stats()
-
         # Must happen after self.read_runs()
         # because it uses self.runs
         self.write_schema()
@@ -1404,12 +1266,6 @@ def main():
         action="store_true",
         help="Skip write_run_display_json() for runs which already have all output display JSON files",
     )
-    parser.add_argument(
-        "-num-instances",
-        type=int,
-        help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
-        default=1000,
-    )
     parser.add_argument(
         "--local-path",
         type=str,
@@ -1422,6 +1278,12 @@ def main():
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
+    )
     args = parser.parse_args()

     release: Optional[str] = None
@@ -1451,7 +1313,8 @@ def main():
     register_configs_from_directory(args.local_path)

     # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
         release=release,
         suites=suites,
         suite=suite,
@@ -1461,7 +1324,7 @@ def main():
         num_threads=args.num_threads,
         allow_unknown_models=args.allow_unknown_models,
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
     hlog("Done.")

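The new --summarizer-class-name flag lets callers substitute a Summarizer subclass by its fully qualified name. get_class_by_name is a HELM helper whose implementation is not part of this diff; the sketch below shows the usual importlib-based way such a lookup works, with get_class_by_name_sketch as an illustrative stand-in, not HELM's actual code.

    import importlib
    from typing import Any

    def get_class_by_name_sketch(full_class_name: str) -> Any:
        # Split "package.module.ClassName" into a module path and an attribute name,
        # import the module, then fetch the class from it.
        module_name, class_name = full_class_name.rsplit(".", 1)
        return getattr(importlib.import_module(module_name), class_name)

    # Usage mirroring main(): fall back to the default class when the flag is unset.
    # summarizer_cls = get_class_by_name_sketch(args.summarizer_class_name) if args.summarizer_class_name else Summarizer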
helm/benchmark/presentation/test_summarize.py
@@ -1,8 +1,9 @@
 import os
 import tempfile

-from helm.benchmark.presentation.summarize import Summarizer
+from helm.benchmark.presentation.summarize import Summarizer, compute_aggregate_row_win_rates
 from helm.benchmark.presentation.schema import get_default_schema_path
+from helm.benchmark.presentation.table import Cell, HeaderCell, Table
 from helm.common.general import ensure_directory_exists

@@ -19,7 +20,7 @@ def test_summarize_suite():
         num_threads=4,
         allow_unknown_models=True,
     )
-    summarizer.run_pipeline(skip_completed=True
+    summarizer.run_pipeline(skip_completed=True)
     assert os.path.isfile(os.path.join(output_path, "runs", "test_suite", "groups.json"))

@@ -37,5 +38,146 @@ def test_summarize_release():
         num_threads=4,
         allow_unknown_models=True,
     )
-    summarizer.run_pipeline(skip_completed=True
+    summarizer.run_pipeline(skip_completed=True)
     assert os.path.isfile(os.path.join(output_path, "releases", "test_release", "groups.json"))
+
+
+def test_compute_win_rates_one_scenario():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 2],
+        ["Model C", 3],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
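Read together with the later tests, this expectation implies a fractional-rank semantics: a row's win rate in a column is the fraction of the other rows it beats, so strictly increasing values 1 through 5 map to 0, 0.25, 0.5, 0.75, and 1.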
+def test_compute_win_rates_two_scenarios():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, 2],
+        ["Model D", 4, 5],
+        ["Model E", 5, 4],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.25, 0.125, 0.375, 0.875, 0.875]
+
+
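Model A's 0.25 here is consistent with averaging per-scenario win rates: 0/4 in Scenario A (its value 1 beats no other model) and 2/4 in Scenario B (its value 3 beats 1 and 2), giving (0 + 0.5) / 2 = 0.25.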
+def test_compute_win_rates_incomplete_values():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, None],
+        ["Model D", 4, None],
+        ["Model E", 5, None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.5, 0.125, 0.5, 0.75, 1]
+
+
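Model A's 0.5 suggests that each column's denominator counts only the rows holding a value there: 0/4 in Scenario A but 1/1 in Scenario B (only Models A and B compete), so (0 + 1) / 2 = 0.5, while Models C through E are ranked on Scenario A alone.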
+def test_compute_win_rates_ignore_nones():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+        HeaderCell(value="Scenario C", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, None, None],
+        ["Model B", 2, None, 1],
+        ["Model C", 3, None, None],
+        ["Model D", 4, None, None],
+        ["Model E", 5, None, None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_ignore_unset_lower_is_better():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B"),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, 2],
+        ["Model D", 4, 5],
+        ["Model E", 5, 4],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_no_win_rate():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", None],
+        ["Model B", None],
+        ["Model C", None],
+        ["Model D", None],
+        ["Model E", None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [None, None, None, None, None]
+
+
+def test_compute_win_rates_ties():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 1],
+        ["Model C", 1],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.25, 0.25, 0.25, 0.75, 1.0]
+
+
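The 0.25 shared by the three tied models indicates that ties count as half-wins: (0 strict wins + 2 ties × 0.5) / 4 = 0.25.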
+def test_compute_win_rates_lower_is_better():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=True),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 2],
+        ["Model C", 3],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [1, 0.75, 0.5, 0.25, 0]