crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/reeval_runner.py ADDED
@@ -0,0 +1,355 @@
+import dacite
+import json
+import os
+import typing
+from collections import Counter
+from typing import Any, Dict, List, Optional
+import torch
+
+from tqdm import tqdm
+from dataclasses import replace
+from datasets import load_dataset
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.common.general import ensure_directory_exists, write, asdict_without_nones
+from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.cache import cache_stats
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    create_scenario,
+    Instance,
+    get_scenario_cache_path,
+    with_instance_ids,
+)
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.data_preprocessor import DataPreprocessor
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+from helm.benchmark.runner import (
+    Runner,
+    remove_stats_nans,
+    remove_per_instance_stats_nans,
+)
+
+scenario_to_metric_name = {
+    "air_bench_2024": "air_score",
+    "babi_qa": "quasi_exact_match",
+    "bbq": "quasi_exact_match",
+    "blimp": "exact_match",
+    "boolq": "quasi_exact_match",
+    "civil_comments": "quasi_exact_match",
+    "dyck_language": "exact_match_indicator",
+    "entity_data_imputation": "quasi_exact_match",
+    "entity_matching": "quasi_exact_match",
+    "imdb": "quasi_exact_match",
+    "legal_support": "quasi_exact_match",
+    "raft": "quasi_exact_match",
+    "synthetic_reasoning": "quasi_exact_match",
+    "truthful_qa": "exact_match",
+    "wikifact": "quasi_exact_match",
+    "mmlu": "exact_match",
+    "commonsense": "exact_match",
+    "gsm": "final_number_exact_match",
+    # "gsm": ["exact_match_indicator", "final_number_exact_match"],
+    "legalbench": "quasi_exact_match",
+    "math": "math_equiv_chain_of_thought",
+    "med_qa": "quasi_exact_match",
+    "thai_exam": "exact_match",
+}
+
+
+class REEvalRunner(Runner):
+    """
+    This runner implements the basic (non-amortized) method described in the paper
+    `Reliable and Efficient Amortized Model-Based Evaluation`. This approach, which is
+    also known as Computerized Adaptive Testing (CAT) within the framework of Item Response
+    Theory (IRT), leverages adaptive testing to evaluate model performance.
+
+    The difficulties of the questions are provided in a HuggingFace repository. In addition,
+    the authors of the paper will supply a Python package for calculating these difficulties.
+    At each iteration, the runner estimates the model's ability based on all previously
+    administered questions and their corresponding responses. It then selects the next question
+    whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+    eliciting the model's ability.
+    """
+
+    def __init__(
+        self,
+        execution_spec: ExecutionSpec,
+        output_path: str,
+        suite: str,
+        skip_instances: bool,
+        cache_instances: bool,
+        cache_instances_only: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
+    ):
+        super().__init__(
+            execution_spec=execution_spec,
+            output_path=output_path,
+            suite=suite,
+            skip_instances=skip_instances,
+            cache_instances=cache_instances,
+            cache_instances_only=cache_instances_only,
+            skip_completed_runs=skip_completed_runs,
+            exit_on_error=exit_on_error,
+        )
+
+    def _estimate_model_ability(
+        self,
+        old_ability: float,
+        response_correctness: List[float],
+        instance_difficulties: List[float],
+    ) -> float:
+        def closure():
+            optim.zero_grad()
+            probs = torch.sigmoid(ability + difficulties)
+            loss = -torch.distributions.Bernoulli(probs=probs).log_prob(responses).mean()
+            loss.backward()
+            return loss
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        responses = torch.tensor(response_correctness, device=device)
+        difficulties = torch.tensor(instance_difficulties, device=device)
+        ability = torch.tensor([old_ability], requires_grad=True, device=device)
+        optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, history_size=10, line_search_fn="strong_wolfe")
+
+        for iteration in range(100):
+            loss = optim.step(closure)
+
+            if iteration > 0:
+                prev_ability = ability.clone()
+                prev_loss = loss
+                d_loss = prev_loss - loss
+                d_theta = torch.norm(prev_ability - ability, p=2)
+                grad_norm = torch.norm(optim.param_groups[0]["params"][0].grad, p=2)
+                if d_loss < 1e-5 and d_theta < 1e-5 and grad_norm < 1e-5:
+                    break
+
+        return ability.item()
+
+    def run_one(self, run_spec: RunSpec):
+        run_path: str = self._get_run_path(run_spec)
+        if self.skip_completed_runs and self._is_run_completed(run_path):
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+        ensure_directory_exists(run_path)
+
+        # Load the scenario
+        scenario: Scenario = create_scenario(run_spec.scenario_spec)
+
+        # This 'output_path' will be used when the model's input instances are saved.
+        args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
+        scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
+        input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
+        input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
+
+        instances: List[Instance]
+        if self.skip_instances:
+            instances = []
+        else:
+            if self.cache_instances and os.path.exists(input_instances_file_path):
+                with open(input_instances_file_path) as f:
+                    json_instances: List[Dict[str, Any]] = json.load(f)
+                instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
+            else:
+                # Create the instances of the scenario
+                scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
+                with htrack_block("scenario.get_instances"):
+                    instances = scenario.get_instances(scenario_output_path)
+        if self.cache_instances and not os.path.exists(input_instances_file_path):
+            # Save instances to file
+            ensure_directory_exists(input_instances_output_path)
+            write(
+                os.path.join(input_instances_file_path),
+                json.dumps([asdict_without_nones(instance) for instance in instances], indent=2),
+            )
+        if self.cache_instances_only:
+            return  # Exit after saving the instances.
+
+        # Give each instance a unique ID
+        if any([instance.id is None for instance in instances]):
+            instances = with_instance_ids(instances)
+
+        # Data preprocessing
+        instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
+            instances, self.executor.execution_spec.parallelism
+        )
+
+        # Adapt (convert to requests)
+        adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+        unasked_request_states_without_difficulty: List[RequestState] = adapter.adapt(
+            instances, self.executor.execution_spec.parallelism
+        )
+
+        # load difficulty
+        split_name = "dyck_language_np_3" if scenario.name == "dyck_language" else scenario.name
+        try:
+            difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
+            prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
+        except ValueError:
+            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            return
+
+        unasked_request_states: List[RequestState] = []
+        for request_state in unasked_request_states_without_difficulty:
+            prompt = request_state.request.prompt
+            if prompt in prompt_to_difficulty:
+                difficulty = prompt_to_difficulty[prompt]
+                current_extra_data = request_state.instance.extra_data or {}
+                if "difficulty" in current_extra_data:
+                    raise Exception("Extra_data already contains a 'difficulty' key.")
+                new_extra_data = current_extra_data.copy()
+                new_extra_data["difficulty"] = difficulty
+                new_instance = replace(request_state.instance, extra_data=new_extra_data)
+                new_request_state = replace(request_state, instance=new_instance)
+                unasked_request_states.append(new_request_state)
+        assert unasked_request_states
+
+        # Execute the requests in an reeval manner
+        assert run_spec.adapter_spec.reeval_parameters is not None
+        model_ability: float = run_spec.adapter_spec.reeval_parameters.model_ability or 0.0
+        scenario_metric_name: str = scenario_to_metric_name[scenario.name]
+
+        asked_request_states: List[RequestState] = []
+        reeval_trajectory: Dict[str, List[float]] = {
+            "model_ability": [],
+            "response_correctness": [],
+            "instance_difficulties": [],
+        }
+
+        assert run_spec.adapter_spec.max_eval_instances is not None
+        for _ in tqdm(range(run_spec.adapter_spec.max_eval_instances), desc="REEval Execution"):
+            if not unasked_request_states:
+                break
+
+            selected_item: Optional[RequestState] = None
+            min_diff = float("inf")
+            for item in unasked_request_states:
+                assert item.instance.extra_data is not None
+                diff = abs(item.instance.extra_data["difficulty"] + model_ability)
+                if diff < min_diff:
+                    min_diff = diff
+                    selected_item = item
+            assert selected_item is not None
+            unasked_request_states.remove(selected_item)
+
+            # Execute the request
+            single_scenario_state: ScenarioState = ScenarioState(
+                adapter_spec=run_spec.adapter_spec,
+                request_states=[selected_item],
+                annotator_specs=run_spec.annotators,
+            )
+
+            # Execute (fill up results)
+            single_scenario_state = self.executor.execute(single_scenario_state)
+
+            # Annotate (post-process the results)
+            single_scenario_state = self.annotator_executor.execute(single_scenario_state)
+
+            # Apply the metrics
+            # When performing a dry run, only estimate the number of tokens instead
+            # of calculating the metrics.
+            metrics: List[MetricInterface] = (
+                [DryRunMetric()]
+                if self.dry_run
+                else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
+            )
+
+            temp_per_instance_stats: List[PerInstanceStats] = []
+            with htrack_block(f"{len(metrics)} metrics"):
+                for metric in metrics:
+                    with htrack_block(metric):
+                        temp_metric_result: MetricResult = metric.evaluate(
+                            single_scenario_state,
+                            self.metric_service,
+                            self.eval_cache_path,
+                            self.executor.execution_spec.parallelism,
+                        )
+                        temp_per_instance_stats.extend(temp_metric_result.per_instance_stats)
+
+            # Update the reeval request states
+            asked_request_states.extend(single_scenario_state.request_states)
+
+            # Update the reeval trajectory
+            reeval_trajectory["model_ability"].append(model_ability)
+            scenario_metric_value = [
+                s for s in temp_per_instance_stats[0].stats if s.name.name == scenario_metric_name
+            ][0].mean
+
+            assert scenario_metric_value is not None
+            reeval_trajectory["response_correctness"].append(scenario_metric_value)
+            assert selected_item.instance.extra_data is not None
+            reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])
+
+            # Estimate the model ability
+            model_ability = self._estimate_model_ability(
+                old_ability=model_ability,
+                response_correctness=reeval_trajectory["response_correctness"],
+                instance_difficulties=reeval_trajectory["instance_difficulties"],
+            )
+
+        # Create the scenario state
+        scenario_state: ScenarioState = ScenarioState(
+            adapter_spec=run_spec.adapter_spec,
+            request_states=asked_request_states,
+            annotator_specs=run_spec.annotators,
+        )
+
+        stats: List[Stat] = []
+        per_instance_stats: List[PerInstanceStats] = []
+        with htrack_block(f"{len(metrics)} metrics"):
+            for metric in metrics:
+                with htrack_block(metric):
+                    metric_result: MetricResult = metric.evaluate(
+                        scenario_state,
+                        self.metric_service,
+                        self.eval_cache_path,
+                        self.executor.execution_spec.parallelism,
+                    )
+                    stats.extend(metric_result.aggregated_stats)
+                    per_instance_stats.extend(metric_result.per_instance_stats)
+
+        # Check that there aren't duplicate `Stat`s
+        # Note: doesn't catch near misses.
+        metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
+        for metric_name, count in metric_counts.items():
+            if count > 1:
+                hlog(f"WARNING: duplicate metric name {metric_name}")
+
+        # Print out the number of stats
+        hlog(f"Generated {len(stats)} stats.")
+
+        if self.skip_instances:
+            hlog("skip_instances was True. Skipping writing results out.")
+            return
+
+        # Output benchmarking information and results to files
+        write(os.path.join(run_path, "run_spec.json"), json.dumps(asdict_without_nones(run_spec), indent=2))
+
+        # Write out scenario
+        write(os.path.join(run_path, "scenario.json"), json.dumps(asdict_without_nones(scenario), indent=2))
+
+        # Write scenario state
+        write(os.path.join(run_path, "scenario_state.json"), json.dumps(asdict_without_nones(scenario_state), indent=2))
+
+        write(
+            os.path.join(run_path, "stats.json"),
+            json.dumps([asdict_without_nones(stat) for stat in remove_stats_nans(stats)], indent=2),
+        )
+        write(
+            os.path.join(run_path, "per_instance_stats.json"),
+            json.dumps(list(map(asdict_without_nones, remove_per_instance_stats_nans(per_instance_stats))), indent=2),
+        )
+
+        write(
+            os.path.join(run_path, "reeval_trajectory.json"),
+            json.dumps(reeval_trajectory, indent=2),
+        )
+
+        cache_stats.print_status()
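For context on the adaptive loop above: under the one-parameter IRT model fit by `_estimate_model_ability`, the probability of a correct response is `sigmoid(ability + difficulty)`, so picking the unasked item that minimizes `abs(difficulty + model_ability)` targets the point where the predicted success probability is near 0.5 and each response is most informative. Below is a minimal, self-contained sketch of that loop; the item pool and the `ask_model` oracle are invented stand-ins for the stair-lab/reeval-difficulty lookup and the real executor/metric pipeline.

# A minimal sketch of the REEval-style adaptive loop, assuming the same
# 1PL/Rasch model P(correct) = sigmoid(ability + z) that REEvalRunner fits.
# "ask_model" and "pool" are hypothetical stand-ins for illustration only.
import torch


def estimate_ability(old_ability: float, correctness, difficulties) -> float:
    """Maximum-likelihood ability estimate, mirroring _estimate_model_ability."""
    responses = torch.tensor(correctness)
    z = torch.tensor(difficulties)
    ability = torch.tensor([old_ability], requires_grad=True)  # warm start, as in the runner
    optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, line_search_fn="strong_wolfe")

    def closure():
        optim.zero_grad()
        loss = -torch.distributions.Bernoulli(probs=torch.sigmoid(ability + z)).log_prob(responses).mean()
        loss.backward()
        return loss

    for _ in range(100):
        optim.step(closure)
    # Note: while all recorded answers are correct (or all wrong), the MLE is
    # unbounded and the optimizer just drifts; the fixed iteration budget
    # bounds the drift (the runner shares this property).
    return ability.item()


def ask_model(z: float) -> float:
    # Placeholder for executing one request and scoring it with the scenario's
    # metric: this fake model solves any item with difficulty parameter below 0.5.
    return 1.0 if z < 0.5 else 0.0


pool = {"q0": -2.0, "q1": -0.5, "q2": 0.3, "q3": 1.8}  # item -> difficulty parameter z
ability, correctness, asked = 0.0, [], []
while pool:
    # Ask the unasked item whose difficulty is closest to the current ability
    # estimate, i.e. where the predicted success probability is nearest 0.5.
    item = min(pool, key=lambda i: abs(pool[i] + ability))
    z = pool.pop(item)
    correctness.append(ask_model(z))
    asked.append(z)
    ability = estimate_ability(ability, correctness, asked)
    print(f"asked {item} (z={z:+.1f}) -> ability estimate {ability:+.2f}")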
helm/benchmark/run.py CHANGED
@@ -266,13 +266,6 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
-    parser.add_argument(
-        "--openvino",
-        action="store_true",
-        default=False,
-        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
-        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
-    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -284,19 +277,13 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
         for huggingface_model_name in args.enable_huggingface_models:
-            if args.openvino:
-                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
-            else:
-                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
     if args.enable_local_huggingface_models:
        from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
        for huggingface_model_path in args.enable_local_huggingface_models:
-            if args.openvino:
-                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
-            else:
-                register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
@@ -323,12 +310,16 @@ def main():
         if model_to_run not in all_models:
             raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
     else:
-        model_expander_pattern = re.compile(
+        model_expander_wildcard_pattern = re.compile(
             r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
         )
-        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+        if any(model_expander_wildcard_pattern.search(run_entry.description) for run_entry in run_entries):
             raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
 
+        model_expander_pattern = re.compile(r"\bmodel=\b")
+        if not any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander is omitted")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,
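Taken together, the two checks in the last hunk mean a run entry must either name a concrete model or be accompanied by `--models-to-run`. A quick sanity check of the two regular expressions (pattern strings copied from the diff above; the run entry descriptions are invented examples):

# Demonstrates which descriptions each pattern from run.py matches.
import re

wildcard_pattern = re.compile(
    r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
)
any_model_pattern = re.compile(r"\bmodel=\b")

for description in [
    "mmlu:subject=anatomy,model=all",          # wildcard: rejected without --models-to-run
    "mmlu:subject=anatomy,model=openai/gpt2",  # concrete model: accepted
    "mmlu:subject=anatomy",                    # no model= expander: now also rejected
]:
    print(description, bool(wildcard_pattern.search(description)), bool(any_model_pattern.search(description)))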
helm/benchmark/run_expander.py CHANGED
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 from dataclasses import replace
 from typing import Any, List, Dict, Optional, Tuple, Type
 
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.model_metadata_registry import (
     get_all_instruction_following_models,
     get_all_code_models,
@@ -11,19 +12,21 @@ from helm.benchmark.model_metadata_registry import (
     get_model_metadata,
     get_model_names_with_tag,
     DEPRECATED_MODEL_TAG,
+    UNSUPPORTED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
-from .run_spec import RunSpec
+from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
-from .augmentations.perturbation import PerturbationSpec
-from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.augmentations.perturbation import PerturbationSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
 
 
@@ -347,6 +350,29 @@ class AnthropicClaude3RunExpander(RunExpander):
         return [run_spec]
 
 
+class NovaRunExpander(RunExpander):
+    """
+    Custom prompt for Amazon Nova models.
+    These models need more explicit instructions about following the format.
+    """
+
+    name = "amazon-nova"
+
+    PROMPT = "Do not provide any additional explanation. Follow the format shown in the provided examples strictly."
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(run_spec.adapter_spec, global_prefix=NovaRunExpander.PROMPT + "\n\n"),
+            ),
+        ]
+
+
 class FollowFormatInstructionsRunExpander(RunExpander):
     """Adds more explicit instructions about following the format to prompts.
 
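To make the new expander concrete, here is an illustrative sketch of what NovaRunExpander.expand does to a run spec's adapter, with AdapterSpec reduced to the single field it touches (a stand-in, not HELM's actual class):

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class AdapterSpec:  # reduced stand-in for helm's AdapterSpec
    global_prefix: str = ""

PROMPT = "Do not provide any additional explanation. Follow the format shown in the provided examples strictly."

adapter_spec = AdapterSpec()
expanded = replace(adapter_spec, global_prefix=PROMPT + "\n\n")
# Every prompt built from `expanded` now begins with the Nova formatting instructions.
assert expanded.global_prefix.startswith("Do not provide")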
@@ -588,6 +614,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
         "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
         "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
         "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
+        "audiolm": get_model_names_with_tag(AUDIO_LANGUAGE_MODEL_TAG),
     }
 
     # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
@@ -604,8 +631,10 @@ class ModelRunExpander(ReplaceValueRunExpander):
 
     # For each of the keys above, filter out deprecated models.
     deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+    unsupported_models = set(get_model_names_with_tag(UNSUPPORTED_MODEL_TAG))
+    excluded_models = deprecated_models | unsupported_models
     for family_name in values_dict.keys():
-        values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+        values_dict[family_name] = [model for model in values_dict[family_name] if model not in excluded_models]
 
     return values_dict
 
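The exclusion change above is a plain set union; a toy illustration with hypothetical model names:

deprecated_models = {"example/legacy-model"}
unsupported_models = {"example/unsupported-model"}
excluded_models = deprecated_models | unsupported_models  # set union

values_dict = {"text": ["example/legacy-model", "example/current-model", "example/unsupported-model"]}
for family_name in values_dict.keys():
    values_dict[family_name] = [model for model in values_dict[family_name] if model not in excluded_models]

assert values_dict == {"text": ["example/current-model"]}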
@@ -1424,14 +1453,20 @@ class OutputFormatInstructions(RunExpander):
     name = "output_format_instructions"
 
     _SUFFIX_SUFFIX = "_suffix"
+    _NO_PREFIX_SUFFIX = "_no_prefix"
 
     def __init__(self, scenario: str):
+        self.suffix = False
         if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
-
+            scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
             self.suffix = True
-
-
-
+
+        self.no_prefix = False
+        if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
+            scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
+            self.no_prefix = True
+
+        self.scenario = scenario
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
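The two suffixes compose because "_suffix" is stripped before "_no_prefix" is checked; a worked example of the parsing order (str.removesuffix requires Python 3.9+):

scenario = "mcqa_no_prefix_suffix"

suffix = False
if scenario.endswith("_suffix"):
    scenario = scenario.removesuffix("_suffix")  # -> "mcqa_no_prefix"
    suffix = True

no_prefix = False
if scenario.endswith("_no_prefix"):
    scenario = scenario.removesuffix("_no_prefix")  # -> "mcqa"
    no_prefix = True

assert (scenario, suffix, no_prefix) == ("mcqa", True, True)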
@@ -1441,6 +1476,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
             else:
                 instructions = "Answer with only a single letter."
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1452,6 +1489,8 @@ class OutputFormatInstructions(RunExpander):
                 )
             elif self.scenario == "natural_qa":
                 instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "natural_qa_short_answer":
+                instructions = "Answer with a short answer."
             elif self.scenario == "legalbench":
                 if output_noun != "Answer":
                     instructions = f"Answer with the {output_noun.lower()}."
@@ -1483,6 +1522,11 @@ class OutputFormatInstructions(RunExpander):
         else:
             raise ValueError(f"Unknown scenario {self.scenario}")
 
+        if self.no_prefix:
+            if instructions:
+                instructions += " "
+            instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+
         if self.suffix:
             return [
                 replace(
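Tracing the new no_prefix branch with an assumed output_prefix of "Answer: " (the value is illustrative; at run time it comes from the adapter spec):

instructions = "Answer with only a single letter."
output_prefix = "Answer: "  # assumed adapter_spec.output_prefix

no_prefix = True
if no_prefix:
    if instructions:
        instructions += " "
    instructions += f"Do not include '{output_prefix.strip()}' in your answer."

assert instructions == "Answer with only a single letter. Do not include 'Answer:' in your answer."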
@@ -1506,6 +1550,31 @@ class OutputFormatInstructions(RunExpander):
         ]
 
 
+class ProcessOutputRunExpander(RunExpander):
+    name = "process_output"
+
+    def __init__(self, processor: str):
+        self.processor = processor
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        output_processing_metric_spec = MetricSpec(
+            class_name="helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
+            args={
+                "processor": self.processor,
+                "metric_specs": [
+                    {"class_name": metric_spec.class_name, "args": metric_spec.args}
+                    for metric_spec in run_spec.metric_specs
+                ],
+            },
+        )
+        return [
+            replace(
+                run_spec,
+                metric_specs=[output_processing_metric_spec],
+            ),
+        ]
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
@@ -1532,6 +1601,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     TemperatureRunExpander,
     IncreaseTemperatureRunExpander,
     IncreaseMaxTokensRunExpander,
+    ProcessOutputRunExpander,
 ]
 
 
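Since run expanders are dispatched by their name attribute, this one is presumably triggered by a process_output=<processor> argument in a run entry. Its effect on a run spec's metrics, sketched with plain dicts (the processor name is hypothetical):

original_metric_specs = [
    {"class_name": "helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric", "args": {}},
]
wrapped_metric_spec = {
    "class_name": "helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
    "args": {
        "processor": "my_processor",  # hypothetical processor name
        "metric_specs": original_metric_specs,  # the original specs are nested inside, not discarded
    },
}
# The run spec's metric_specs list is replaced by [wrapped_metric_spec].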
helm/benchmark/run_spec_factory.py CHANGED
@@ -37,6 +37,7 @@ from helm.benchmark.run_expander import (
     IncreaseTemperatureRunExpander,
     IncreaseMaxTokensRunExpander,
     LlavaRunExpander,
+    ModelRunExpander,
     OpenFlamingoRunExpander,
     StopRunExpander,
 )
@@ -61,6 +62,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS]  # type: ignore
     args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
 
+    # If no model run expander was specified, add the model=all run expander
+    if not any([expander for expander in expanders if isinstance(expander, ModelRunExpander)]):
+        expanders.append(ModelRunExpander("all"))
+
     run_specs: List[RunSpec] = [run_spec_function(**args)]
 
     # Apply expanders
@@ -138,6 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
+    if model.name == "openai/o1-2024-12-17":
+        # From https://platform.openai.com/docs/guides/reasoning,
+        # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        # prompts require, you can adjust this buffer accordingly."
+        run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
         if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
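The practical effect of the first two hunks is that a run entry description with no model= argument now behaves as if model=all were present. A simplified sketch of the default-insertion check (class bodies reduced to what the check needs):

class ModelRunExpander:  # stand-in for the imported expander
    def __init__(self, value: str):
        self.value = value

expanders: list = []  # no model expander parsed from the run entry
if not any(isinstance(expander, ModelRunExpander) for expander in expanders):
    expanders.append(ModelRunExpander("all"))  # mirrors the hunk above

assert expanders[0].value == "all"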
helm/benchmark/run_specs/air_bench_run_specs.py CHANGED
@@ -1,3 +1,5 @@
+from typing import Dict, Optional
+
 from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.metric import MetricSpec
@@ -6,7 +8,10 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
 @run_spec_function("air_bench_2024")
-def get_air_bench_2024_spec() -> RunSpec:
+def get_air_bench_2024_spec(
+    annotator_model: Optional[str] = None, annotator_model_deployment: Optional[str] = None
+) -> RunSpec:
+    run_spec_name = "air_bench_2024"
     adapter_spec = AdapterSpec(
         method=ADAPT_GENERATION,
         global_prefix="",
@@ -24,14 +29,27 @@ def get_air_bench_2024_spec() -> RunSpec:
         stop_sequences=[],
     )
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
-
+    annotator_args: Dict[str, str] = {}
+    if annotator_model:
+        annotator_args["model"] = annotator_model
+        annotator_args["model_deployment"] = annotator_model_deployment or annotator_model
+        run_spec_name = (
+            "air_bench_2024:"
+            f"annotator_model={annotator_args['model']},"
+            f"annotator_model_deployment={annotator_args['model_deployment']}"
+        )
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator", args=annotator_args
+        )
+    ]
     metric_specs = [
         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
         MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
     ]
     return RunSpec(
-        name="air_bench_2024",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,