crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/reeval_runner.py
ADDED
@@ -0,0 +1,355 @@
+import dacite
+import json
+import os
+import typing
+from collections import Counter
+from typing import Any, Dict, List, Optional
+import torch
+
+from tqdm import tqdm
+from dataclasses import replace
+from datasets import load_dataset
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.common.general import ensure_directory_exists, write, asdict_without_nones
+from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.cache import cache_stats
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    create_scenario,
+    Instance,
+    get_scenario_cache_path,
+    with_instance_ids,
+)
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.data_preprocessor import DataPreprocessor
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+from helm.benchmark.runner import (
+    Runner,
+    remove_stats_nans,
+    remove_per_instance_stats_nans,
+)
+
+scenario_to_metric_name = {
+    "air_bench_2024": "air_score",
+    "babi_qa": "quasi_exact_match",
+    "bbq": "quasi_exact_match",
+    "blimp": "exact_match",
+    "boolq": "quasi_exact_match",
+    "civil_comments": "quasi_exact_match",
+    "dyck_language": "exact_match_indicator",
+    "entity_data_imputation": "quasi_exact_match",
+    "entity_matching": "quasi_exact_match",
+    "imdb": "quasi_exact_match",
+    "legal_support": "quasi_exact_match",
+    "raft": "quasi_exact_match",
+    "synthetic_reasoning": "quasi_exact_match",
+    "truthful_qa": "exact_match",
+    "wikifact": "quasi_exact_match",
+    "mmlu": "exact_match",
+    "commonsense": "exact_match",
+    "gsm": "final_number_exact_match",
+    # "gsm": ["exact_match_indicator", "final_number_exact_match"],
+    "legalbench": "quasi_exact_match",
+    "math": "math_equiv_chain_of_thought",
+    "med_qa": "quasi_exact_match",
+    "thai_exam": "exact_match",
+}
+
+
+class REEvalRunner(Runner):
+    """
+    This runner implements the basic (non-amortized) method described in the paper
+    `Reliable and Efficient Amortized Model-Based Evaluation`. This approach, which is
+    also known as Computerized Adaptive Testing (CAT) within the framework of Item Response
+    Theory (IRT), leverages adaptive testing to evaluate model performance.
+
+    The difficulties of the questions are provided in a HuggingFace repository. In addition,
+    the authors of the paper will supply a Python package for calculating these difficulties.
+    At each iteration, the runner estimates the model's ability based on all previously
+    administered questions and their corresponding responses. It then selects the next question
+    whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+    eliciting the model's ability.
+    """
+
+    def __init__(
+        self,
+        execution_spec: ExecutionSpec,
+        output_path: str,
+        suite: str,
+        skip_instances: bool,
+        cache_instances: bool,
+        cache_instances_only: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
+    ):
+        super().__init__(
+            execution_spec=execution_spec,
+            output_path=output_path,
+            suite=suite,
+            skip_instances=skip_instances,
+            cache_instances=cache_instances,
+            cache_instances_only=cache_instances_only,
+            skip_completed_runs=skip_completed_runs,
+            exit_on_error=exit_on_error,
+        )
+
+    def _estimate_model_ability(
+        self,
+        old_ability: float,
+        response_correctness: List[float],
+        instance_difficulties: List[float],
+    ) -> float:
+        def closure():
+            optim.zero_grad()
+            probs = torch.sigmoid(ability + difficulties)
+            loss = -torch.distributions.Bernoulli(probs=probs).log_prob(responses).mean()
+            loss.backward()
+            return loss
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        responses = torch.tensor(response_correctness, device=device)
+        difficulties = torch.tensor(instance_difficulties, device=device)
+        ability = torch.tensor([old_ability], requires_grad=True, device=device)
+        optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, history_size=10, line_search_fn="strong_wolfe")
+
+        for iteration in range(100):
+            loss = optim.step(closure)
+
+            if iteration > 0:
+                prev_ability = ability.clone()
+                prev_loss = loss
+                d_loss = prev_loss - loss
+                d_theta = torch.norm(prev_ability - ability, p=2)
+                grad_norm = torch.norm(optim.param_groups[0]["params"][0].grad, p=2)
+                if d_loss < 1e-5 and d_theta < 1e-5 and grad_norm < 1e-5:
+                    break
+
+        return ability.item()
+
+    def run_one(self, run_spec: RunSpec):
+        run_path: str = self._get_run_path(run_spec)
+        if self.skip_completed_runs and self._is_run_completed(run_path):
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+        ensure_directory_exists(run_path)
+
+        # Load the scenario
+        scenario: Scenario = create_scenario(run_spec.scenario_spec)
+
+        # This 'output_path' will be used when the model's input instances are saved.
+        args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
+        scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
+        input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
+        input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
+
+        instances: List[Instance]
+        if self.skip_instances:
+            instances = []
+        else:
+            if self.cache_instances and os.path.exists(input_instances_file_path):
+                with open(input_instances_file_path) as f:
+                    json_instances: List[Dict[str, Any]] = json.load(f)
+                instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
+            else:
+                # Create the instances of the scenario
+                scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
+                with htrack_block("scenario.get_instances"):
+                    instances = scenario.get_instances(scenario_output_path)
+            if self.cache_instances and not os.path.exists(input_instances_file_path):
+                # Save instances to file
+                ensure_directory_exists(input_instances_output_path)
+                write(
+                    os.path.join(input_instances_file_path),
+                    json.dumps([asdict_without_nones(instance) for instance in instances], indent=2),
+                )
+        if self.cache_instances_only:
+            return  # Exit after saving the instances.
+
+        # Give each instance a unique ID
+        if any([instance.id is None for instance in instances]):
+            instances = with_instance_ids(instances)
+
+        # Data preprocessing
+        instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
+            instances, self.executor.execution_spec.parallelism
+        )
+
+        # Adapt (convert to requests)
+        adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+        unasked_request_states_without_difficulty: List[RequestState] = adapter.adapt(
+            instances, self.executor.execution_spec.parallelism
+        )
+
+        # load difficulty
+        split_name = "dyck_language_np_3" if scenario.name == "dyck_language" else scenario.name
+        try:
+            difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
+            prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
+        except ValueError:
+            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            return
+
+        unasked_request_states: List[RequestState] = []
+        for request_state in unasked_request_states_without_difficulty:
+            prompt = request_state.request.prompt
+            if prompt in prompt_to_difficulty:
+                difficulty = prompt_to_difficulty[prompt]
+                current_extra_data = request_state.instance.extra_data or {}
+                if "difficulty" in current_extra_data:
+                    raise Exception("Extra_data already contains a 'difficulty' key.")
+                new_extra_data = current_extra_data.copy()
+                new_extra_data["difficulty"] = difficulty
+                new_instance = replace(request_state.instance, extra_data=new_extra_data)
+                new_request_state = replace(request_state, instance=new_instance)
+                unasked_request_states.append(new_request_state)
+        assert unasked_request_states
+
+        # Execute the requests in an reeval manner
+        assert run_spec.adapter_spec.reeval_parameters is not None
+        model_ability: float = run_spec.adapter_spec.reeval_parameters.model_ability or 0.0
+        scenario_metric_name: str = scenario_to_metric_name[scenario.name]
+
+        asked_request_states: List[RequestState] = []
+        reeval_trajectory: Dict[str, List[float]] = {
+            "model_ability": [],
+            "response_correctness": [],
+            "instance_difficulties": [],
+        }
+
+        assert run_spec.adapter_spec.max_eval_instances is not None
+        for _ in tqdm(range(run_spec.adapter_spec.max_eval_instances), desc="REEval Execution"):
+            if not unasked_request_states:
+                break
+
+            selected_item: Optional[RequestState] = None
+            min_diff = float("inf")
+            for item in unasked_request_states:
+                assert item.instance.extra_data is not None
+                diff = abs(item.instance.extra_data["difficulty"] + model_ability)
+                if diff < min_diff:
+                    min_diff = diff
+                    selected_item = item
+            assert selected_item is not None
+            unasked_request_states.remove(selected_item)
+
+            # Execute the request
+            single_scenario_state: ScenarioState = ScenarioState(
+                adapter_spec=run_spec.adapter_spec,
+                request_states=[selected_item],
+                annotator_specs=run_spec.annotators,
+            )
+
+            # Execute (fill up results)
+            single_scenario_state = self.executor.execute(single_scenario_state)
+
+            # Annotate (post-process the results)
+            single_scenario_state = self.annotator_executor.execute(single_scenario_state)
+
+            # Apply the metrics
+            # When performing a dry run, only estimate the number of tokens instead
+            # of calculating the metrics.
+            metrics: List[MetricInterface] = (
+                [DryRunMetric()]
+                if self.dry_run
+                else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
+            )
+
+            temp_per_instance_stats: List[PerInstanceStats] = []
+            with htrack_block(f"{len(metrics)} metrics"):
+                for metric in metrics:
+                    with htrack_block(metric):
+                        temp_metric_result: MetricResult = metric.evaluate(
+                            single_scenario_state,
+                            self.metric_service,
+                            self.eval_cache_path,
+                            self.executor.execution_spec.parallelism,
+                        )
+                        temp_per_instance_stats.extend(temp_metric_result.per_instance_stats)
+
+            # Update the reeval request states
+            asked_request_states.extend(single_scenario_state.request_states)
+
+            # Update the reeval trajectory
+            reeval_trajectory["model_ability"].append(model_ability)
+            scenario_metric_value = [
+                s for s in temp_per_instance_stats[0].stats if s.name.name == scenario_metric_name
+            ][0].mean
+
+            assert scenario_metric_value is not None
+            reeval_trajectory["response_correctness"].append(scenario_metric_value)
+            assert selected_item.instance.extra_data is not None
+            reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])
+
+            # Estimate the model ability
+            model_ability = self._estimate_model_ability(
+                old_ability=model_ability,
+                response_correctness=reeval_trajectory["response_correctness"],
+                instance_difficulties=reeval_trajectory["instance_difficulties"],
+            )
+
+        # Create the scenario state
+        scenario_state: ScenarioState = ScenarioState(
+            adapter_spec=run_spec.adapter_spec,
+            request_states=asked_request_states,
+            annotator_specs=run_spec.annotators,
+        )
+
+        stats: List[Stat] = []
+        per_instance_stats: List[PerInstanceStats] = []
+        with htrack_block(f"{len(metrics)} metrics"):
+            for metric in metrics:
+                with htrack_block(metric):
+                    metric_result: MetricResult = metric.evaluate(
+                        scenario_state,
+                        self.metric_service,
+                        self.eval_cache_path,
+                        self.executor.execution_spec.parallelism,
+                    )
+                    stats.extend(metric_result.aggregated_stats)
+                    per_instance_stats.extend(metric_result.per_instance_stats)
+
+        # Check that there aren't duplicate `Stat`s
+        # Note: doesn't catch near misses.
+        metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
+        for metric_name, count in metric_counts.items():
+            if count > 1:
+                hlog(f"WARNING: duplicate metric name {metric_name}")
+
+        # Print out the number of stats
+        hlog(f"Generated {len(stats)} stats.")
+
+        if self.skip_instances:
+            hlog("skip_instances was True. Skipping writing results out.")
+            return
+
+        # Output benchmarking information and results to files
+        write(os.path.join(run_path, "run_spec.json"), json.dumps(asdict_without_nones(run_spec), indent=2))
+
+        # Write out scenario
+        write(os.path.join(run_path, "scenario.json"), json.dumps(asdict_without_nones(scenario), indent=2))
+
+        # Write scenario state
+        write(os.path.join(run_path, "scenario_state.json"), json.dumps(asdict_without_nones(scenario_state), indent=2))
+
+        write(
+            os.path.join(run_path, "stats.json"),
+            json.dumps([asdict_without_nones(stat) for stat in remove_stats_nans(stats)], indent=2),
+        )
+        write(
+            os.path.join(run_path, "per_instance_stats.json"),
+            json.dumps(list(map(asdict_without_nones, remove_per_instance_stats_nans(per_instance_stats))), indent=2),
+        )
+
+        write(
+            os.path.join(run_path, "reeval_trajectory.json"),
+            json.dumps(reeval_trajectory, indent=2),
+        )
+
+        cache_stats.print_status()
helm/benchmark/run.py
CHANGED
@@ -266,13 +266,6 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
-    parser.add_argument(
-        "--openvino",
-        action="store_true",
-        default=False,
-        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
-        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
-    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -284,19 +277,13 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
         for huggingface_model_name in args.enable_huggingface_models:
-            if args.openvino:
-                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
-            else:
-                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
         for huggingface_model_path in args.enable_local_huggingface_models:
-            if args.openvino:
-                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
-            else:
-                register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
@@ -323,12 +310,16 @@ def main():
         if model_to_run not in all_models:
             raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
     else:
-        model_expander_pattern = re.compile(
+        model_expander_wildcard_pattern = re.compile(
             r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
         )
-        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+        if any(model_expander_wildcard_pattern.search(run_entry.description) for run_entry in run_entries):
             raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
 
+        model_expander_pattern = re.compile(r"\bmodel=\b")
+        if not any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander is omitted")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,
helm/benchmark/run_expander.py
CHANGED
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 from dataclasses import replace
 from typing import Any, List, Dict, Optional, Tuple, Type

+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.model_metadata_registry import (
     get_all_instruction_following_models,
     get_all_code_models,
@@ -11,19 +12,21 @@ from helm.benchmark.model_metadata_registry import (
     get_model_metadata,
     get_model_names_with_tag,
     DEPRECATED_MODEL_TAG,
+    UNSUPPORTED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
-from .run_spec import RunSpec
+from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
-from .augmentations.perturbation import PerturbationSpec
-from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.augmentations.perturbation import PerturbationSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT

@@ -347,6 +350,29 @@ class AnthropicClaude3RunExpander(RunExpander):
         return [run_spec]


+class NovaRunExpander(RunExpander):
+    """
+    Custom prompt for Amazon Nova models.
+    These models need more explicit instructions about following the format.
+    """
+
+    name = "amazon-nova"
+
+    PROMPT = "Do not provide any additional explanation. Follow the format shown in the provided examples strictly."
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(run_spec.adapter_spec, global_prefix=NovaRunExpander.PROMPT + "\n\n"),
+            ),
+        ]
+
+
 class FollowFormatInstructionsRunExpander(RunExpander):
     """Adds more explicit instructions about following the format to prompts.

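Note: a minimal sketch (not part of the diff) of this expander's effect, assuming `run_spec` is an existing RunSpec:

    # The expander only rewrites the adapter spec's global_prefix, so the fixed
    # instruction string is prepended to every prompt for Nova models.
    expanded = NovaRunExpander().expand(run_spec)
    assert expanded[0].adapter_spec.global_prefix == NovaRunExpander.PROMPT + "\n\n"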
@@ -588,6 +614,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
             "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
             "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
             "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
+            "audiolm": get_model_names_with_tag(AUDIO_LANGUAGE_MODEL_TAG),
         }

         # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
@@ -604,8 +631,10 @@ class ModelRunExpander(ReplaceValueRunExpander):

         # For each of the keys above, filter out deprecated models.
         deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+        unsupported_models = set(get_model_names_with_tag(UNSUPPORTED_MODEL_TAG))
+        excluded_models = deprecated_models | unsupported_models
         for family_name in values_dict.keys():
-            values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+            values_dict[family_name] = [model for model in values_dict[family_name] if model not in excluded_models]

         return values_dict

@@ -1271,6 +1300,30 @@ class IncreaseMaxTokensRunExpander(RunExpander):
     ]


+class TemperatureRunExpander(RunExpander):
+    """
+    Run expander for setting the temperature.
+    """
+
+    name = "temperature"
+
+    def __init__(self, value: float):
+        """
+        Args:
+            value (float): The amount to set temperature to
+        """
+        self.value = value
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        adapter_spec = replace(run_spec.adapter_spec, temperature=self.value)
+        return [
+            replace(
+                run_spec,
+                adapter_spec=adapter_spec,
+            ),
+        ]
+
+
 class IncreaseTemperatureRunExpander(RunExpander):
     """
     Run expander for increasing the temperature.
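Note: unlike IncreaseTemperatureRunExpander above, which increases the existing temperature, this one sets it outright. A minimal usage sketch (not part of the diff), assuming `run_spec` is an existing RunSpec:

    # Pin temperature to 0.0 for (near-)deterministic decoding.
    expanded = TemperatureRunExpander(value=0.0).expand(run_spec)
    assert expanded[0].adapter_spec.temperature == 0.0

Because it is registered in RUN_EXPANDER_SUBCLASSES (see below) under the name `temperature`, it should also be triggerable from a run entry description such as `mmlu:subject=anatomy,model=openai/gpt2,temperature=0` (a hypothetical entry).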
@@ -1400,14 +1453,20 @@ class OutputFormatInstructions(RunExpander):
     name = "output_format_instructions"

     _SUFFIX_SUFFIX = "_suffix"
+    _NO_PREFIX_SUFFIX = "_no_prefix"

     def __init__(self, scenario: str):
+        self.suffix = False
         if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
-            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
             self.suffix = True
-        else:
-            self.scenario = scenario
-            self.suffix = False
+
+        self.no_prefix = False
+        if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
+            scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
+            self.no_prefix = True
+
+        self.scenario = scenario

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
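Note: the rewritten __init__ strips modifiers in a fixed order, `_suffix` before `_no_prefix`, so both can be stacked on one scenario string. A minimal sketch (not part of the diff):

    expander = OutputFormatInstructions("mcqa_no_prefix_suffix")
    # "_suffix" is stripped first, then "_no_prefix", leaving the base scenario.
    assert (expander.scenario, expander.suffix, expander.no_prefix) == ("mcqa", True, True)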
@@ -1417,6 +1476,8 @@ class OutputFormatInstructions(RunExpander):
             instructions = "Answer with only a single letter."
         elif self.scenario == "mcqa":
             instructions = "Answer with only a single letter."
+        elif self.scenario == "mcqa_only_last_question":
+            instructions = "Answer only the last question with only a single letter."
         else:
             instructions = "Answer with only a single letter."
     elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1428,6 +1489,8 @@ class OutputFormatInstructions(RunExpander):
             )
         elif self.scenario == "natural_qa":
             instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+        elif self.scenario == "natural_qa_short_answer":
+            instructions = "Answer with a short answer."
         elif self.scenario == "legalbench":
             if output_noun != "Answer":
                 instructions = f"Answer with the {output_noun.lower()}."
@@ -1459,6 +1522,11 @@ class OutputFormatInstructions(RunExpander):
         else:
             raise ValueError(f"Unknown scenario {self.scenario}")

+        if self.no_prefix:
+            if instructions:
+                instructions += " "
+            instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+
         if self.suffix:
             return [
                 replace(
@@ -1482,6 +1550,31 @@ class OutputFormatInstructions(RunExpander):
     ]


+class ProcessOutputRunExpander(RunExpander):
+    name = "process_output"
+
+    def __init__(self, processor: str):
+        self.processor = processor
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        output_processing_metric_spec = MetricSpec(
+            class_name="helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
+            args={
+                "processor": self.processor,
+                "metric_specs": [
+                    {"class_name": metric_spec.class_name, "args": metric_spec.args}
+                    for metric_spec in run_spec.metric_specs
+                ],
+            },
+        )
+        return [
+            replace(
+                run_spec,
+                metric_specs=[output_processing_metric_spec],
+            ),
+        ]
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
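Note: rather than adding a metric, this expander replaces the run's metric specs with a single OutputProcessingMetric that post-processes the model output and then delegates to the original metrics, which are nested in its args. A minimal sketch (not part of the diff), assuming an existing `run_spec`; the processor name is hypothetical:

    expanded = ProcessOutputRunExpander(processor="my_processor").expand(run_spec)
    [wrapper] = expanded[0].metric_specs  # the original metric specs now live inside the wrapper
    assert wrapper.class_name == "helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric"
    assert wrapper.args["processor"] == "my_processor"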
@@ -1505,6 +1598,10 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     ChatMLRunExpander,
     EvalSplitRunExpander,
     OutputFormatInstructions,
+    TemperatureRunExpander,
+    IncreaseTemperatureRunExpander,
+    IncreaseMaxTokensRunExpander,
+    ProcessOutputRunExpander,
 ]

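Note: TemperatureRunExpander and ProcessOutputRunExpander are new in this release, while IncreaseTemperatureRunExpander and IncreaseMaxTokensRunExpander already existed but were previously absent from RUN_EXPANDER_SUBCLASSES; registering all four makes them addressable by name from run entry descriptions.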
helm/benchmark/run_spec_factory.py
CHANGED
@@ -37,6 +37,7 @@ from helm.benchmark.run_expander import (
     IncreaseTemperatureRunExpander,
     IncreaseMaxTokensRunExpander,
     LlavaRunExpander,
+    ModelRunExpander,
     OpenFlamingoRunExpander,
     StopRunExpander,
 )
@@ -61,6 +62,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS]  # type: ignore
     args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)

+    # If no model run expander was specified, add the model=all run expander
+    if not any([expander for expander in expanders if isinstance(expander, ModelRunExpander)]):
+        expanders.append(ModelRunExpander("all"))
+
     run_specs: List[RunSpec] = [run_spec_function(**args)]

     # Apply expanders
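Note: combined with the new checks in main() above, a run entry description that omits `model=` now implicitly behaves like `model=all`, which is why main() now insists on --models-to-run whenever `model=` is missing or uses a wildcard that expands to multiple models.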
@@ -138,6 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))

+    if model.name == "openai/o1-2024-12-17":
+        # From https://platform.openai.com/docs/guides/reasoning,
+        # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        # prompts require, you can adjust this buffer accordingly."
+        run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
         if IDEFICS_INSTRUCT_MODEL_TAG in model.tags: