crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry, and is provided for informational purposes only.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/presentation/torr_robustness_summarizer.py (new file)
@@ -0,0 +1,178 @@
import os
from typing import Dict, List, Optional

from helm.benchmark.metrics.metric import PerInstanceStats
from helm.benchmark.presentation.schema import MetricNameMatcher, RunGroup
from helm.benchmark.presentation.summarize import Run, Summarizer
from helm.benchmark.presentation.table import Cell
from helm.common.codec import from_json


class ToRRRobustnessSummarizer(Summarizer):
    """A Summarizer that computes robustness metrics.

    This Summarizer computes a robustness metric based on the definition in the ToRR paper.
    The instance-level robustness score for a given model and instance is defined as
    1 - (max_score - min_score), where max_score and min_score are the maximum and minimum
    scores for the model on that instance across all runs (i.e. across all augmentations
    and serialization formats). The robustness score for a given model and scenario is
    the mean of the model's instance-level robustness scores across all instances in that scenario.

    The core HELM framework does not natively support computing metrics that depend on
    per-instance metrics across multiple runs, therefore this special summarizer is needed
    to compute this robustness metric."""

    def __init__(
        self,
        release: Optional[str],
        suites: Optional[List[str]],
        suite: Optional[str],
        schema_path: str,
        output_path: str,
        verbose: bool,
        num_threads: int,
        allow_unknown_models: bool,
    ):
        super().__init__(
            release,
            suites,
            suite,
            schema_path,
            output_path,
            verbose,
            num_threads,
            allow_unknown_models,
        )
        self.run_group_to_model_name_to_robustness: Dict[str, Dict[str, float]] = {}

    PERFORMANCE_METRIC_GROUP_NAME = "performance_metrics"
    ROBUSTNESS_METRIC_GROUP_NAME = "robustness_metrics"
    ROBUSTNESS_METRIC_NAME = "robustness"

    def _get_instance_id_to_performance(
        self, run: Run, performance_metric_matcher: MetricNameMatcher
    ) -> Dict[str, float]:
        with open(os.path.join(run.run_path, "per_instance_stats.json")) as f:
            per_instance_stats = from_json(f.read(), List[PerInstanceStats])
        instance_id_to_performance: Dict[str, float] = {}
        for per_instance_stats_item in per_instance_stats:
            assert per_instance_stats_item.train_trial_index == 0
            assert per_instance_stats_item.perturbation is None
            for stat in per_instance_stats_item.stats:
                if performance_metric_matcher.matches(stat.name):
                    assert per_instance_stats_item.instance_id not in instance_id_to_performance
                    if stat.mean is not None:
                        instance_id_to_performance[per_instance_stats_item.instance_id] = stat.mean

        return instance_id_to_performance

    def _compute_robustness_for_runs(self, runs: List[Run], performance_metric_matcher: MetricNameMatcher) -> float:
        instance_id_to_performances: Dict[str, List[float]] = {}
        for run in runs:
            for instance_id, performance in self._get_instance_id_to_performance(
                run, performance_metric_matcher
            ).items():
                if instance_id not in instance_id_to_performances:
                    instance_id_to_performances[instance_id] = []
                instance_id_to_performances[instance_id].append(performance)
        instance_id_to_robustness: Dict[str, float] = {}
        for instance_id, performances in instance_id_to_performances.items():
            instance_id_to_robustness[instance_id] = 1 - (max(performances) - min(performances))
        return sum(instance_id_to_robustness.values()) / len(instance_id_to_robustness.values())

    def _compute_robustness_for_run_group(self, run_group: RunGroup) -> Dict[str, float]:
        performance_metric_group = self.schema.name_to_metric_group[self.PERFORMANCE_METRIC_GROUP_NAME]
        assert len(performance_metric_group.metrics) == 1
        performance_metric_matcher = performance_metric_group.metrics[0].substitute(run_group.environment)

        group_runs = [run for run in self.runs if run_group.name in run.run_spec.groups]
        model_name_to_runs: Dict[str, List[Run]] = {}

        for run in group_runs:
            model_name = run.run_spec.adapter_spec.model
            if model_name not in model_name_to_runs:
                model_name_to_runs[model_name] = []
            model_name_to_runs[model_name].append(run)

        model_to_robustness: Dict[str, float] = {}
        for model_name, model_runs in model_name_to_runs.items():
            model_to_robustness[model_name] = self._compute_robustness_for_runs(model_runs, performance_metric_matcher)
        return model_to_robustness

    def write_groups(self):
        for run_group in self.schema.run_groups:
            if self.ROBUSTNESS_METRIC_GROUP_NAME in run_group.metric_groups and self.PERFORMANCE_METRIC_GROUP_NAME in run_group.metric_groups:  # noqa: E501
                self.run_group_to_model_name_to_robustness[run_group.name] = self._compute_robustness_for_run_group(
                    run_group
                )
        return super().write_groups()

    def create_cell(
        self,
        runs: List[Run],
        matcher: MetricNameMatcher,
        additional_info: Optional[str],
        hide_value: bool = False,
        is_scenario_table: bool = False,
    ) -> Cell:
        """
        Use the metric name identified by `matcher` to pull out the stats from
        `runs` and return a representation of the average.
        There are four cases:
        1. No matching runs
        2. Matching runs but no matching stats (maybe stat was named incorrectly)
        3. Matching runs, matching stats, but stats have count = 0, so mean is undefined
           (e.g., bias metric ran and computed 0/0)
        4. Matching runs, matching stats, stats with count > 0

        In the first three cases, the cell value is None, but the description distinguishes between these cases.
        """
        if matcher.name != self.ROBUSTNESS_METRIC_NAME:
            return super().create_cell(runs, matcher, additional_info, hide_value, is_scenario_table)

        if len(runs) == 0:
            return Cell(value=None, description="No matching runs")

        # Link the runs that this cell was aggregated from, if this is not a scenario table.
        # Scenario tables link to the runs in the model cells,
        # whereas non-scenario tables link to the runs in the metrics cells.
        run_spec_names: Optional[List[str]] = None
        if not is_scenario_table:
            # Deduplicate run spec names because aggregated_run_specs may have duplicated
            # run specs if a run spec belongs to multiple groups.
            run_spec_names = []
            run_spec_names_set = set()
            for run in runs:
                if run.run_spec.name not in run_spec_names_set:
                    run_spec_names.append(run.run_spec.name)
                    run_spec_names_set.add(run.run_spec.name)

        run_group_set = set(runs[0].run_spec.groups) & set(self.run_group_to_model_name_to_robustness.keys())
        assert len(run_group_set) == 1
        run_group = next(iter(run_group_set))

        model_names_set = set(run.run_spec.adapter_spec.model for run in runs)
        assert len(model_names_set) == 1
        model_name = next(iter(model_names_set))

        value = (
            self.run_group_to_model_name_to_robustness[run_group][model_name]
            if self.run_group_to_model_name_to_robustness[run_group]
            and model_name in self.run_group_to_model_name_to_robustness[run_group]
            else None
        )
        description = str(round(value, 3)) if value is not None else ""
        if hide_value:
            value = None
            description = ""
        if additional_info:
            description += "\n" + additional_info
        if self.verbose:
            description += "\n-- ".join(["\nRun specs:", *(run_spec_names or [])])

        return Cell(
            value=value,
            description=description,
            style={},
            run_spec_names=run_spec_names,
        )
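For intuition, here is a minimal standalone sketch of the aggregation that ToRRRobustnessSummarizer implements; the instance IDs and per-run scores below are made-up illustration values, not data from the package:

import statistics

# Made-up scores for one model on the same two instances across three runs
# (e.g. three different table serialization formats).
runs = [
    {"id0": 0.9, "id1": 0.4},
    {"id0": 0.7, "id1": 0.4},
    {"id0": 0.8, "id1": 0.5},
]

# Group scores by instance across runs.
scores_by_instance: dict = {}
for run_scores in runs:
    for instance_id, score in run_scores.items():
        scores_by_instance.setdefault(instance_id, []).append(score)

# Instance-level robustness is 1 - (max - min); the scenario-level score is the mean.
per_instance = [1 - (max(s) - min(s)) for s in scores_by_instance.values()]
print(statistics.mean(per_instance))  # approx 0.85: (0.8 + 0.9) / 2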
helm/benchmark/reeval_run.py (new file)
@@ -0,0 +1,203 @@
import argparse
from dataclasses import replace
import re
from typing import List

from helm.benchmark import model_metadata_registry
from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
from helm.common.general import ensure_directory_exists
from helm.common.hierarchical_logger import hlog, htrack
from helm.common.authentication import Authentication
from helm.proxy.services.remote_service import create_authentication, add_service_args

from helm.benchmark.config_registry import (
    register_configs_from_directory,
    register_builtin_configs_from_helm_package,
)
from helm.benchmark.runner import set_benchmark_output_path
from helm.common.reeval_parameters import REEvalParameters
from helm.benchmark.run import (
    run_benchmarking,
    validate_args,
    add_run_args,
    run_entries_to_run_specs,
)


@htrack(None)
def main():
    parser = argparse.ArgumentParser()
    add_service_args(parser)
    parser.add_argument(
        "-c",
        "--conf-paths",
        nargs="+",
        help="Where to read RunSpecs to run from",
        default=[],
    )
    parser.add_argument(
        "--models-to-run",
        nargs="+",
        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
        default=None,
    )
    parser.add_argument(
        "--groups-to-run",
        nargs="+",
        help="Only RunSpecs with these (scenario) groups specified. If no group is specified, runs with all groups.",
        default=None,
    )
    parser.add_argument(
        "--exit-on-error",
        action="store_true",
        help="Fail and exit immediately if a particular RunSpec fails.",
    )
    parser.add_argument(
        "--skip-completed-runs",
        action="store_true",
        help="Skip RunSpecs that have completed, i.e. output files exist.",
    )
    parser.add_argument(
        "--priority",
        type=int,
        default=None,
        help="Run RunSpecs with priority less than or equal to this number. "
        "If a value for --priority is not specified, run on everything",
    )
    parser.add_argument(
        "--run-specs",
        nargs="*",
        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
        "Specifies run entries to run.",
        default=[],
    )
    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
    parser.add_argument(
        "--enable-huggingface-models",
        nargs="+",
        default=[],
        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
        "Format: namespace/model_name[@revision]",
    )
    parser.add_argument(
        "--enable-local-huggingface-models",
        nargs="+",
        default=[],
        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
    )
    # reeval parameters
    parser.add_argument(
        "--model-ability",
        type=float,
        default=0.0,
        help="The initial ability of the model for reeval evaluation.",
    )
    add_run_args(parser)
    args = parser.parse_args()
    validate_args(args)

    register_builtin_configs_from_helm_package()
    register_configs_from_directory(args.local_path)

    if args.enable_huggingface_models:
        from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

        for huggingface_model_name in args.enable_huggingface_models:
            register_huggingface_hub_model_from_flag_value(huggingface_model_name)

    if args.enable_local_huggingface_models:
        from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

        for huggingface_model_path in args.enable_local_huggingface_models:
            register_huggingface_local_model_from_flag_value(huggingface_model_path)

    run_entries: List[RunEntry] = []
    if args.conf_paths:
        run_entries.extend(read_run_entries(args.conf_paths).entries)
    if args.run_entries:
        run_entries.extend(
            [RunEntry(description=description, priority=1, groups=None) for description in args.run_entries]
        )
    # TODO: Remove this eventually.
    if args.run_specs:
        run_entries.extend(
            [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
        )

    # Must set benchmark output path before getting RunSpecs,
    # because run spec functions can use the benchmark output directory for caching.
    ensure_directory_exists(args.output_path)
    set_benchmark_output_path(args.output_path)

    # Validate the --models-to-run flag
    if args.models_to_run:
        all_models = set(model_metadata_registry.get_all_models())
        for model_to_run in args.models_to_run:
            if model_to_run not in all_models:
                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
    else:
        model_expander_pattern = re.compile(
            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
        )
        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
            raise Exception("--models-to-run must be set if the `model=` run expander expands to multiple models")

    run_specs = run_entries_to_run_specs(
        run_entries=run_entries,
        max_eval_instances=args.max_eval_instances,
        num_train_trials=args.num_train_trials,
        models_to_run=args.models_to_run,
        groups_to_run=args.groups_to_run,
        priority=args.priority,
    )
    hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")

    if len(run_specs) == 0:
        hlog("There were no RunSpecs or they got filtered out.")
        return

    # Add reeval_parameters
    run_specs = [
        replace(
            run_spec,
            adapter_spec=replace(
                run_spec.adapter_spec, reeval_parameters=REEvalParameters(model_ability=args.model_ability)
            ),
        )
        for run_spec in run_specs
    ]

    auth: Authentication = (
        Authentication("") if args.skip_instances or not args.server_url else create_authentication(args)
    )

    run_benchmarking(
        run_specs=run_specs,
        auth=auth,
        url=args.server_url,
        local_path=args.local_path,
        num_threads=args.num_threads,
        output_path=args.output_path,
        suite=args.suite,
        dry_run=args.dry_run,
        skip_instances=args.skip_instances,
        cache_instances=args.cache_instances,
        cache_instances_only=args.cache_instances_only,
        skip_completed_runs=args.skip_completed_runs,
        exit_on_error=args.exit_on_error,
        runner_class_name="helm.benchmark.reeval_runner.REEvalRunner",
        mongo_uri=args.mongo_uri,
        disable_cache=args.disable_cache,
    )

    if args.run_specs:
        hlog(
            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
            "Use --run-entries instead."
        )

    hlog("Done.")


if __name__ == "__main__":
    main()
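One detail worth calling out in the file above: when --models-to-run is omitted, run entries that use the multi-model `model=` run expander are rejected. A small standalone sketch of that guard, using the same regex as reeval_run.py (the run-entry descriptions are made up for illustration):

import re

pattern = re.compile(
    r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
)

print(bool(pattern.search("mmlu:subject=anatomy,model=all")))          # True: rejected without --models-to-run
print(bool(pattern.search("mmlu:subject=anatomy,model=openai/gpt2")))  # False: a single concrete model is allowed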