crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0

helm/benchmark/metrics/tokens/auto_token_cost_estimator.py
@@ -2,12 +2,12 @@ from typing import Dict
 
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .ai21_token_cost_estimator import AI21TokenCostEstimator
-from .cohere_token_cost_estimator import CohereTokenCostEstimator
-from .free_token_cost_estimator import FreeTokenCostEstimator
-from .gooseai_token_cost_estimator import GooseAITokenCostEstimator
-from .openai_token_cost_estimator import OpenAITokenCostEstimator
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
+from helm.benchmark.metrics.tokens.cohere_token_cost_estimator import CohereTokenCostEstimator
+from helm.benchmark.metrics.tokens.free_token_cost_estimator import FreeTokenCostEstimator
+from helm.benchmark.metrics.tokens.gooseai_token_cost_estimator import GooseAITokenCostEstimator
+from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class AutoTokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py
@@ -1,6 +1,6 @@
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class CohereTokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/free_token_cost_estimator.py
@@ -1,6 +1,6 @@
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class FreeTokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class GooseAITokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/openai_token_cost_estimator.py
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.window_service import WindowService
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class OpenAITokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
@@ -1,7 +1,7 @@
 from unittest.mock import Mock
 
 from helm.common.request import Request
-from .ai21_token_cost_estimator import AI21TokenCostEstimator
+from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
 
 
 class TestAI21TokenCostEstimator:

helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
 from helm.common.request import Request
 from helm.common.tokenization_request import TokenizationRequestResult, TokenizationToken
 from helm.proxy.services.remote_service import RemoteService
-from .openai_token_cost_estimator import OpenAITokenCostEstimator
+from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
 
 
 class TestOpenAITokenCostEstimator:

helm/benchmark/metrics/toxicity_metrics.py
@@ -6,10 +6,10 @@ from helm.common.hierarchical_logger import hlog
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 
 
 class ToxicityMetric(Metric):

helm/benchmark/metrics/unitxt_metrics.py
@@ -18,7 +18,10 @@ class UnitxtMetric(MetricInterface):
 
     def __init__(self, **kwargs):
         super().__init__()
-        dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
+        if len(kwargs) == 1 and "recipe" in kwargs:
+            dataset_name = kwargs["recipe"]
+        else:
+            dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
         self.dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
 
     def evaluate(
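
A minimal sketch of the new dataset-name resolution in UnitxtMetric, pulled out as a standalone helper for illustration (the helper name and the example recipe values are hypothetical, not part of HELM):

def resolve_unitxt_dataset_name(**kwargs) -> str:
    # Mirrors the logic added to UnitxtMetric.__init__: a lone "recipe" keyword
    # is passed through verbatim; otherwise the keyword arguments are joined
    # into unitxt's "key=value,key=value" recipe string.
    if len(kwargs) == 1 and "recipe" in kwargs:
        return kwargs["recipe"]
    return ",".join(f"{key}={value}" for key, value in kwargs.items())

# resolve_unitxt_dataset_name(recipe="card=cards.example,template=templates.example")
#   -> "card=cards.example,template=templates.example"
# resolve_unitxt_dataset_name(card="cards.example", template="templates.example")
#   -> "card=cards.example,template=templates.example"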

helm/benchmark/metrics/vision_language/image_metrics.py
@@ -324,7 +324,7 @@ class AnnotatedImageMetrics(Metric):
 
         # Compute the LPIPS score
         assert self._lpips_metric is not None
-        score: float = self._lpips_metric(img1, img2).detach().item()
+        score: float = 1.0 - self._lpips_metric(img1, img2).detach().item()
         return score
 
     def _calculate_fid(self, act1, act2):
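
Since LPIPS is a perceptual distance (lower means the two images are more similar), subtracting the raw value from 1.0 turns the reported score into a similarity-style metric where higher is better.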

helm/benchmark/metrics/wildbench_metrics.py (new file)
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class WildBenchScoreMetric(Metric):
+    """Score metrics for WildBench."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["wildbench"]
+        scores: List[float] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_score") and annotation_value is not None:
+                scores.append(annotation_value)
+        if not scores:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(scores) / len(scores)
+        score_rescaled = (score - 1) / 9
+        return [
+            Stat(MetricName("wildbench_score")).add(score),
+            Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
+        ]
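
The rescaling at the end of the new metric maps the averaged judge score onto [0, 1]; the (score - 1) / 9 form implies annotator scores on a 1-10 scale (an inference from the formula, not stated in the diff). A small illustrative check:

def rescale_wildbench_score(score: float) -> float:
    # Same arithmetic as WildBenchScoreMetric, assuming judge scores in [1, 10].
    return (score - 1) / 9

assert rescale_wildbench_score(1.0) == 0.0    # worst judge score maps to 0
assert rescale_wildbench_score(10.0) == 1.0   # best judge score maps to 1
assert abs(rescale_wildbench_score(5.5) - 0.5) < 1e-9  # midpoint maps to 0.5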

helm/benchmark/model_metadata_registry.py
@@ -22,6 +22,9 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
+# For NOVA models
+NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
+
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"

@@ -66,10 +69,18 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# For Audio-langauge models (AudioLMs)
+AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
+
 # Deprecated models that are no longer available.
 # These are usually closed API models that have been permanently removed
 DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
+# Unsupported models.
+# These are models that we have chosen not to support because they are
+# private, stale, non-notable, or difficult to implement.
+UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
+
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 

@@ -208,6 +219,11 @@ def is_vlm(model_name: str) -> bool:
     return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
 
 
+def is_audiolm(model_name: str) -> bool:
+    """Returns True if the model is a audio-language model (AudioLM). False otherwise."""
+    return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
+
+
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(

helm/benchmark/presentation/summarize.py
@@ -52,6 +52,10 @@ from helm.benchmark.presentation.schema import (
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
+
+
+MODEL_HEADER_CELL_VALUE = "Model"
 
 
 @dataclass(frozen=True)

@@ -262,18 +266,22 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     """
 
     row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
 
     # check for all header cells where specified, that lower_is_better is consistent
     orderings = []
-    for header_cell in table.header:
-        orderings.append(header_cell.lower_is_better)
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
     if len(set(orderings)) != 1:
         raise Exception("Cannot mean columns with different values for lower_is_better")
 
     for row in table.rows:
         total = 0.0
         count = 0
-        for cell in row:
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
             if cell.value is not None:
                 total += float(cell.value)
                 count += 1
|
|
|
811
819
|
num_groups = len(set(run_group.name for run_group, _ in columns)) # number of unique groups, determines headers
|
|
812
820
|
|
|
813
821
|
# Column headers
|
|
814
|
-
header.append(HeaderCell(
|
|
822
|
+
header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
|
|
815
823
|
for run_group, metric_group_name in columns:
|
|
816
824
|
# check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
|
|
817
825
|
if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
|
|

@@ -969,22 +977,20 @@
             if strategy == AggregationStrategy.WIN_RATE:
                 WIN_RATE_AGGREGATION = "mean"
                 win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-                description = "How many models this model outperforms on average (over columns)."
                 aggregate_header_cells.append(
                     HeaderCell(
                         f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                        description=description,
+                        description="How many models this model outperforms on average (over columns).",
                         lower_is_better=False,
                     )
                 )
                 aggregate_row_values.append(win_rates)
             elif strategy == AggregationStrategy.MEAN:
                 means = compute_aggregate_row_means(table)
-                description = "An average over columns representing the mean performance."
                 aggregate_header_cells.append(
                     HeaderCell(
-                        "Mean
-                        description=description,
+                        "Mean score",
+                        description="The mean of the scores from all columns.",
                         lower_is_better=table.header[0].lower_is_better,
                     )
                 )

@@ -1272,6 +1278,12 @@ def main():
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
+    )
     args = parser.parse_args()
 
     release: Optional[str] = None

@@ -1301,7 +1313,8 @@ def main():
     register_configs_from_directory(args.local_path)
 
     # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
         release=release,
         suites=suites,
         suite=suite,
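
Taken together, the summarize.py hunks above let helm-summarize construct an alternative Summarizer subclass named on the command line. A hedged sketch of the dispatch; the fully qualified class path below is inferred from the new file introduced next and is not spelled out in the diff itself:

from helm.common.object_spec import get_class_by_name
from helm.benchmark.presentation.summarize import Summarizer

# Equivalent of passing --summarizer-class-name on the helm-summarize command line.
summarizer_class_name = "helm.benchmark.presentation.torr_robustness_summarizer.ToRRRobustnessSummarizer"
summarizer_cls = get_class_by_name(summarizer_class_name) if summarizer_class_name else Summarizer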
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.metrics.metric import PerInstanceStats
|
|
5
|
+
from helm.benchmark.presentation.schema import MetricNameMatcher, RunGroup
|
|
6
|
+
from helm.benchmark.presentation.summarize import Run, Summarizer
|
|
7
|
+
from helm.benchmark.presentation.table import Cell
|
|
8
|
+
from helm.common.codec import from_json
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ToRRRobustnessSummarizer(Summarizer):
|
|
12
|
+
"""A Summarizer that computes robustness metrics.
|
|
13
|
+
|
|
14
|
+
This Summarizer computes a robustness metrics based on the definition in the ToRR paper.
|
|
15
|
+
The instance-level robustness score for a given model and instance is defined as
|
|
16
|
+
1 - (max_score - min_score) where max_score and min_scores are the maximum and minimum
|
|
17
|
+
scores for the model on that instance across all runs (i.e. across all augmentations
|
|
18
|
+
and serialization formats). The robustness score for a given model and scenario is
|
|
19
|
+
the mean of the model's instance-level robustness score across all instances in that scenario.
|
|
20
|
+
|
|
21
|
+
The core HELM framework does not natively support computing metrics that depend on
|
|
22
|
+
per-instance metrics across multiple runs, therefore this special summarizer is needed
|
|
23
|
+
to compute this robustness metic."""
|
|
24
|
+
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
release: Optional[str],
|
|
28
|
+
suites: Optional[List[str]],
|
|
29
|
+
suite: Optional[str],
|
|
30
|
+
schema_path: str,
|
|
31
|
+
output_path: str,
|
|
32
|
+
verbose: bool,
|
|
33
|
+
num_threads: int,
|
|
34
|
+
allow_unknown_models: bool,
|
|
35
|
+
):
|
|
36
|
+
super().__init__(
|
|
37
|
+
release,
|
|
38
|
+
suites,
|
|
39
|
+
suite,
|
|
40
|
+
schema_path,
|
|
41
|
+
output_path,
|
|
42
|
+
verbose,
|
|
43
|
+
num_threads,
|
|
44
|
+
allow_unknown_models,
|
|
45
|
+
)
|
|
46
|
+
self.run_group_to_model_name_to_robustness: Dict[str, Dict[str, float]] = {}
|
|
47
|
+
|
|
48
|
+
PERFORMANCE_METRIC_GROUP_NAME = "performance_metrics"
|
|
49
|
+
ROBUSTNESS_METRIC_GROUP_NAME = "robustness_metrics"
|
|
50
|
+
ROBUSTNESS_METRIC_NAME = "robustness"
|
|
51
|
+
|
|
52
|
+
def _get_instance_id_to_performance(
|
|
53
|
+
self, run: Run, performance_metric_matcher: MetricNameMatcher
|
|
54
|
+
) -> Dict[str, float]:
|
|
55
|
+
with open(os.path.join(run.run_path, "per_instance_stats.json")) as f:
|
|
56
|
+
per_instance_stats = from_json(f.read(), List[PerInstanceStats])
|
|
57
|
+
instance_id_to_performance: Dict[str, float] = {}
|
|
58
|
+
for per_instance_stats_item in per_instance_stats:
|
|
59
|
+
assert per_instance_stats_item.train_trial_index == 0
|
|
60
|
+
assert per_instance_stats_item.perturbation is None
|
|
61
|
+
for stat in per_instance_stats_item.stats:
|
|
62
|
+
if performance_metric_matcher.matches(stat.name):
|
|
63
|
+
assert per_instance_stats_item.instance_id not in instance_id_to_performance
|
|
64
|
+
if stat.mean is not None:
|
|
65
|
+
instance_id_to_performance[per_instance_stats_item.instance_id] = stat.mean
|
|
66
|
+
|
|
67
|
+
return instance_id_to_performance
|
|
68
|
+
|
|
69
|
+
def _compute_robustness_for_runs(self, runs: List[Run], performance_metric_matcher: MetricNameMatcher) -> float:
|
|
70
|
+
instance_id_to_performances: Dict[str, List[float]] = {}
|
|
71
|
+
for run in runs:
|
|
72
|
+
for instance_id, performance in self._get_instance_id_to_performance(
|
|
73
|
+
run, performance_metric_matcher
|
|
74
|
+
).items():
|
|
75
|
+
if instance_id not in instance_id_to_performances:
|
|
76
|
+
instance_id_to_performances[instance_id] = []
|
|
77
|
+
instance_id_to_performances[instance_id].append(performance)
|
|
78
|
+
instance_id_to_robustness: Dict[str, float] = {}
|
|
79
|
+
for instance_id, performances in instance_id_to_performances.items():
|
|
80
|
+
instance_id_to_robustness[instance_id] = 1 - (max(performances) - min(performances))
|
|
81
|
+
return sum(instance_id_to_robustness.values()) / len(instance_id_to_robustness.values())
|
|
82
|
+
|
|
83
|
+
    def _compute_robustness_for_run_group(self, run_group: RunGroup) -> Dict[str, float]:
        performance_metric_group = self.schema.name_to_metric_group[self.PERFORMANCE_METRIC_GROUP_NAME]
        assert len(performance_metric_group.metrics) == 1
        performance_metric_matcher = performance_metric_group.metrics[0].substitute(run_group.environment)

        group_runs = [run for run in self.runs if run_group.name in run.run_spec.groups]
        model_name_to_runs: Dict[str, List[Run]] = {}

        for run in group_runs:
            model_name = run.run_spec.adapter_spec.model
            if model_name not in model_name_to_runs:
                model_name_to_runs[model_name] = []
            model_name_to_runs[model_name].append(run)

        model_to_robustness: Dict[str, float] = {}
        for model_name, model_runs in model_name_to_runs.items():
            model_to_robustness[model_name] = self._compute_robustness_for_runs(model_runs, performance_metric_matcher)
        return model_to_robustness

    def write_groups(self):
        for run_group in self.schema.run_groups:
            if (
                self.ROBUSTNESS_METRIC_GROUP_NAME in run_group.metric_groups
                and self.PERFORMANCE_METRIC_GROUP_NAME in run_group.metric_groups
            ):
                self.run_group_to_model_name_to_robustness[run_group.name] = self._compute_robustness_for_run_group(
                    run_group
                )
        return super().write_groups()

    def create_cell(
        self,
        runs: List[Run],
        matcher: MetricNameMatcher,
        additional_info: Optional[str],
        hide_value: bool = False,
        is_scenario_table: bool = False,
    ) -> Cell:
        """
        Use the metric name identified by `matcher` to pull out the stats from
        `runs` and return a representation of the average.

        There are four cases:
        1. No matching runs
        2. Matching runs but no matching stats (maybe the stat was named incorrectly)
        3. Matching runs, matching stats, but the stats have count = 0, so the mean is undefined
           (e.g., the bias metric ran and computed 0/0)
        4. Matching runs, matching stats, stats with count > 0

        In the first three cases, the cell value is None, but the description distinguishes between these cases.
        """
        if matcher.name != self.ROBUSTNESS_METRIC_NAME:
            return super().create_cell(runs, matcher, additional_info, hide_value, is_scenario_table)

        if len(runs) == 0:
            return Cell(value=None, description="No matching runs")

        # Link the runs that this cell was aggregated from, if this is not a scenario table.
        # Scenario tables link to the runs in the model cells,
        # whereas non-scenario tables link to the runs in the metrics cells.
        run_spec_names: Optional[List] = None
        if not is_scenario_table:
            # Deduplicate run spec names because aggregated_run_specs may have duplicated
            # run specs if a run spec belongs to multiple groups.
            run_spec_names = []
            run_spec_names_set = set()
            for run in runs:
                if run.run_spec.name not in run_spec_names_set:
                    run_spec_names.append(run.run_spec.name)
                    run_spec_names_set.add(run.run_spec.name)

        run_group_set = set(runs[0].run_spec.groups) & set(self.run_group_to_model_name_to_robustness.keys())
        assert len(run_group_set) == 1
        run_group = next(iter(run_group_set))

        model_names_set = set(run.run_spec.adapter_spec.model for run in runs)
        assert len(model_names_set) == 1
        model_name = next(iter(model_names_set))

        value = (
            self.run_group_to_model_name_to_robustness[run_group][model_name]
            if self.run_group_to_model_name_to_robustness[run_group]
            and self.run_group_to_model_name_to_robustness[run_group][model_name]
            else None
        )
        description = str(round(value, 3)) if value is not None else ""
        if hide_value:
            value = None
            description = ""
        if additional_info:
            description += "\n" + additional_info
        if self.verbose:
            description += "\n-- ".join(["\nRun specs:", *(run_spec_names or [])])

        return Cell(
            value=value,
            description=description,
            style={},
            run_spec_names=run_spec_names,
        )
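For orientation, here is a minimal sketch of the nested mapping these methods populate and read: write_groups() fills run_group_to_model_name_to_robustness once per run group, and create_cell() resolves exactly one run group and one model for a cell before looking up its value. The group name, model names, and scores below are hypothetical and do not come from this diff.

    from typing import Dict

    # Hypothetical contents after write_groups() (illustrative names and values only):
    run_group_to_model_name_to_robustness: Dict[str, Dict[str, float]] = {
        "civil_comments": {
            "openai/gpt2": 0.81,
            "eleutherai/gpt-j-6b": 0.67,
        },
    }

    # create_cell() then reads the single (run group, model) entry behind the cell:
    value = run_group_to_model_name_to_robustness["civil_comments"]["openai/gpt2"]  # -> 0.81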
@@ -0,0 +1,203 @@
import argparse
from dataclasses import replace
import re
from typing import List

from helm.benchmark import model_metadata_registry
from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
from helm.common.general import ensure_directory_exists
from helm.common.hierarchical_logger import hlog, htrack
from helm.common.authentication import Authentication
from helm.proxy.services.remote_service import create_authentication, add_service_args

from helm.benchmark.config_registry import (
    register_configs_from_directory,
    register_builtin_configs_from_helm_package,
)
from helm.benchmark.runner import set_benchmark_output_path
from helm.common.reeval_parameters import REEvalParameters
from helm.benchmark.run import (
    run_benchmarking,
    validate_args,
    add_run_args,
    run_entries_to_run_specs,
)


@htrack(None)
def main():
    parser = argparse.ArgumentParser()
    add_service_args(parser)
    parser.add_argument(
        "-c",
        "--conf-paths",
        nargs="+",
        help="Where to read RunSpecs to run from",
        default=[],
    )
    parser.add_argument(
        "--models-to-run",
        nargs="+",
        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
        default=None,
    )
    parser.add_argument(
        "--groups-to-run",
        nargs="+",
        help="Only RunSpecs with these (scenario) groups specified. If no group is specified, runs with all groups.",
        default=None,
    )
    parser.add_argument(
        "--exit-on-error",
        action="store_true",
        help="Fail and exit immediately if a particular RunSpec fails.",
    )
    parser.add_argument(
        "--skip-completed-runs",
        action="store_true",
        help="Skip RunSpecs that have completed, i.e. their output files exist.",
    )
    parser.add_argument(
        "--priority",
        type=int,
        default=None,
        help="Run RunSpecs with priority less than or equal to this number. "
        "If a value for --priority is not specified, run on everything",
    )
    parser.add_argument(
        "--run-specs",
        nargs="*",
        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
        "Specifies run entries to run.",
        default=[],
    )
    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
    parser.add_argument(
        "--enable-huggingface-models",
        nargs="+",
        default=[],
        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
        "Format: namespace/model_name[@revision]",
    )
    parser.add_argument(
        "--enable-local-huggingface-models",
        nargs="+",
        default=[],
        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
    )
    # reeval parameters
    parser.add_argument(
        "--model-ability",
        type=float,
        default=0.0,
        help="The initial ability of the model for reeval evaluation.",
    )
    add_run_args(parser)
    args = parser.parse_args()
    validate_args(args)

    register_builtin_configs_from_helm_package()
    register_configs_from_directory(args.local_path)

    if args.enable_huggingface_models:
        from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

        for huggingface_model_name in args.enable_huggingface_models:
            register_huggingface_hub_model_from_flag_value(huggingface_model_name)

    if args.enable_local_huggingface_models:
        from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

        for huggingface_model_path in args.enable_local_huggingface_models:
            register_huggingface_local_model_from_flag_value(huggingface_model_path)

    run_entries: List[RunEntry] = []
    if args.conf_paths:
        run_entries.extend(read_run_entries(args.conf_paths).entries)
    if args.run_entries:
        run_entries.extend(
            [RunEntry(description=description, priority=1, groups=None) for description in args.run_entries]
        )
    # TODO: Remove this eventually.
    if args.run_specs:
        run_entries.extend(
            [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
        )

    # Must set benchmark output path before getting RunSpecs,
    # because run spec functions can use the benchmark output directory for caching.
    ensure_directory_exists(args.output_path)
    set_benchmark_output_path(args.output_path)

    # Validate the --models-to-run flag
    if args.models_to_run:
        all_models = set(model_metadata_registry.get_all_models())
        for model_to_run in args.models_to_run:
            if model_to_run not in all_models:
                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
    else:
        model_expander_pattern = re.compile(
            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
        )
        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
            raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")

    run_specs = run_entries_to_run_specs(
        run_entries=run_entries,
        max_eval_instances=args.max_eval_instances,
        num_train_trials=args.num_train_trials,
        models_to_run=args.models_to_run,
        groups_to_run=args.groups_to_run,
        priority=args.priority,
    )
    hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")

    if len(run_specs) == 0:
        hlog("There were no RunSpecs or they got filtered out.")
        return

    # Add reeval_parameters
    run_specs = [
        replace(
            run_spec,
            adapter_spec=replace(
                run_spec.adapter_spec, reeval_parameters=REEvalParameters(model_ability=args.model_ability)
            ),
        )
        for run_spec in run_specs
    ]

    auth: Authentication = (
        Authentication("") if args.skip_instances or not args.server_url else create_authentication(args)
    )

    run_benchmarking(
        run_specs=run_specs,
        auth=auth,
        url=args.server_url,
        local_path=args.local_path,
        num_threads=args.num_threads,
        output_path=args.output_path,
        suite=args.suite,
        dry_run=args.dry_run,
        skip_instances=args.skip_instances,
        cache_instances=args.cache_instances,
        cache_instances_only=args.cache_instances_only,
        skip_completed_runs=args.skip_completed_runs,
        exit_on_error=args.exit_on_error,
        runner_class_name="helm.benchmark.reeval_runner.REEvalRunner",
        mongo_uri=args.mongo_uri,
        disable_cache=args.disable_cache,
    )

    if args.run_specs:
        hlog(
            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
            "Use --run-entries instead."
        )

    hlog("Done.")


if __name__ == "__main__":
    main()
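As a side note on the --models-to-run validation above: when --models-to-run is not given, the script only rejects run entries whose description uses a `model=` run expander that fans out to many models. A minimal, self-contained sketch of that check is shown below; the pattern string is copied from the file above, while the two example run entry descriptions are made up for illustration.

    import re

    # Same pattern as in the new entry point: detect un-pinned `model=` run expanders.
    model_expander_pattern = re.compile(
        r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
    )

    # Hypothetical run entry descriptions (not taken from this diff):
    assert model_expander_pattern.search("mmlu:subject=anatomy,model=text") is not None  # fans out to many models
    assert model_expander_pattern.search("mmlu:subject=anatomy,model=openai/gpt2") is None  # pinned to one model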