crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/reeval_runner.py
ADDED
@@ -0,0 +1,355 @@
+import dacite
+import json
+import os
+import typing
+from collections import Counter
+from typing import Any, Dict, List, Optional
+import torch
+
+from tqdm import tqdm
+from dataclasses import replace
+from datasets import load_dataset
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.common.general import ensure_directory_exists, write, asdict_without_nones
+from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
+from helm.common.cache import cache_stats
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    create_scenario,
+    Instance,
+    get_scenario_cache_path,
+    with_instance_ids,
+)
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.data_preprocessor import DataPreprocessor
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+from helm.benchmark.runner import (
+    Runner,
+    remove_stats_nans,
+    remove_per_instance_stats_nans,
+)
+
+scenario_to_metric_name = {
+    "air_bench_2024": "air_score",
+    "babi_qa": "quasi_exact_match",
+    "bbq": "quasi_exact_match",
+    "blimp": "exact_match",
+    "boolq": "quasi_exact_match",
+    "civil_comments": "quasi_exact_match",
+    "dyck_language": "exact_match_indicator",
+    "entity_data_imputation": "quasi_exact_match",
+    "entity_matching": "quasi_exact_match",
+    "imdb": "quasi_exact_match",
+    "legal_support": "quasi_exact_match",
+    "raft": "quasi_exact_match",
+    "synthetic_reasoning": "quasi_exact_match",
+    "truthful_qa": "exact_match",
+    "wikifact": "quasi_exact_match",
+    "mmlu": "exact_match",
+    "commonsense": "exact_match",
+    "gsm": "final_number_exact_match",
+    # "gsm": ["exact_match_indicator", "final_number_exact_match"],
+    "legalbench": "quasi_exact_match",
+    "math": "math_equiv_chain_of_thought",
+    "med_qa": "quasi_exact_match",
+    "thai_exam": "exact_match",
+}
+
+
+class REEvalRunner(Runner):
+    """
+    This runner implements the basic (non-amortized) method described in the paper
+    `Reliable and Efficient Amortized Model-Based Evaluation`. This approach, which is
+    also known as Computerized Adaptive Testing (CAT) within the framework of Item Response
+    Theory (IRT), leverages adaptive testing to evaluate model performance.
+
+    The difficulties of the questions are provided in a HuggingFace repository. In addition,
+    the authors of the paper will supply a Python package for calculating these difficulties.
+    At each iteration, the runner estimates the model's ability based on all previously
+    administered questions and their corresponding responses. It then selects the next question
+    whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+    eliciting the model's ability.
+    """
+
+    def __init__(
+        self,
+        execution_spec: ExecutionSpec,
+        output_path: str,
+        suite: str,
+        skip_instances: bool,
+        cache_instances: bool,
+        cache_instances_only: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
+    ):
+        super().__init__(
+            execution_spec=execution_spec,
+            output_path=output_path,
+            suite=suite,
+            skip_instances=skip_instances,
+            cache_instances=cache_instances,
+            cache_instances_only=cache_instances_only,
+            skip_completed_runs=skip_completed_runs,
+            exit_on_error=exit_on_error,
+        )
+
+    def _estimate_model_ability(
+        self,
+        old_ability: float,
+        response_correctness: List[float],
+        instance_difficulties: List[float],
+    ) -> float:
+        def closure():
+            optim.zero_grad()
+            probs = torch.sigmoid(ability + difficulties)
+            loss = -torch.distributions.Bernoulli(probs=probs).log_prob(responses).mean()
+            loss.backward()
+            return loss
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        responses = torch.tensor(response_correctness, device=device)
+        difficulties = torch.tensor(instance_difficulties, device=device)
+        ability = torch.tensor([old_ability], requires_grad=True, device=device)
+        optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, history_size=10, line_search_fn="strong_wolfe")
+
+        for iteration in range(100):
+            loss = optim.step(closure)
+
+            if iteration > 0:
+                prev_ability = ability.clone()
+                prev_loss = loss
+                d_loss = prev_loss - loss
+                d_theta = torch.norm(prev_ability - ability, p=2)
+                grad_norm = torch.norm(optim.param_groups[0]["params"][0].grad, p=2)
+                if d_loss < 1e-5 and d_theta < 1e-5 and grad_norm < 1e-5:
+                    break
+
+        return ability.item()
+
+    def run_one(self, run_spec: RunSpec):
+        run_path: str = self._get_run_path(run_spec)
+        if self.skip_completed_runs and self._is_run_completed(run_path):
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+        ensure_directory_exists(run_path)
+
+        # Load the scenario
+        scenario: Scenario = create_scenario(run_spec.scenario_spec)
+
+        # This 'output_path' will be used when the model's input instances are saved.
+        args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
+        scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
+        input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
+        input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
+
+        instances: List[Instance]
+        if self.skip_instances:
+            instances = []
+        else:
+            if self.cache_instances and os.path.exists(input_instances_file_path):
+                with open(input_instances_file_path) as f:
+                    json_instances: List[Dict[str, Any]] = json.load(f)
+                instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
+            else:
+                # Create the instances of the scenario
+                scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
+                with htrack_block("scenario.get_instances"):
+                    instances = scenario.get_instances(scenario_output_path)
+        if self.cache_instances and not os.path.exists(input_instances_file_path):
+            # Save instances to file
+            ensure_directory_exists(input_instances_output_path)
+            write(
+                os.path.join(input_instances_file_path),
+                json.dumps([asdict_without_nones(instance) for instance in instances], indent=2),
+            )
+        if self.cache_instances_only:
+            return  # Exit after saving the instances.
+
+        # Give each instance a unique ID
+        if any([instance.id is None for instance in instances]):
+            instances = with_instance_ids(instances)
+
+        # Data preprocessing
+        instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
+            instances, self.executor.execution_spec.parallelism
+        )
+
+        # Adapt (convert to requests)
+        adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+        unasked_request_states_without_difficulty: List[RequestState] = adapter.adapt(
+            instances, self.executor.execution_spec.parallelism
+        )
+
+        # load difficulty
+        split_name = "dyck_language_np_3" if scenario.name == "dyck_language" else scenario.name
+        try:
+            difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
+            prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
+        except ValueError:
+            hwarn(f"no available difficulty for {split_name}, skipping")
+            return
+
+        unasked_request_states: List[RequestState] = []
+        for request_state in unasked_request_states_without_difficulty:
+            prompt = request_state.request.prompt
+            if prompt in prompt_to_difficulty:
+                difficulty = prompt_to_difficulty[prompt]
+                current_extra_data = request_state.instance.extra_data or {}
+                if "difficulty" in current_extra_data:
+                    raise Exception("Extra_data already contains a 'difficulty' key.")
+                new_extra_data = current_extra_data.copy()
+                new_extra_data["difficulty"] = difficulty
+                new_instance = replace(request_state.instance, extra_data=new_extra_data)
+                new_request_state = replace(request_state, instance=new_instance)
+                unasked_request_states.append(new_request_state)
+        assert unasked_request_states
+
+        # Execute the requests in an reeval manner
+        assert run_spec.adapter_spec.reeval_parameters is not None
+        model_ability: float = run_spec.adapter_spec.reeval_parameters.model_ability or 0.0
+        scenario_metric_name: str = scenario_to_metric_name[scenario.name]
+
+        asked_request_states: List[RequestState] = []
+        reeval_trajectory: Dict[str, List[float]] = {
+            "model_ability": [],
+            "response_correctness": [],
+            "instance_difficulties": [],
+        }
+
+        assert run_spec.adapter_spec.max_eval_instances is not None
+        for _ in tqdm(range(run_spec.adapter_spec.max_eval_instances), desc="REEval Execution"):
+            if not unasked_request_states:
+                break
+
+            selected_item: Optional[RequestState] = None
+            min_diff = float("inf")
+            for item in unasked_request_states:
+                assert item.instance.extra_data is not None
+                diff = abs(item.instance.extra_data["difficulty"] + model_ability)
+                if diff < min_diff:
+                    min_diff = diff
+                    selected_item = item
+            assert selected_item is not None
+            unasked_request_states.remove(selected_item)
+
+            # Execute the request
+            single_scenario_state: ScenarioState = ScenarioState(
+                adapter_spec=run_spec.adapter_spec,
+                request_states=[selected_item],
+                annotator_specs=run_spec.annotators,
+            )
+
+            # Execute (fill up results)
+            single_scenario_state = self.executor.execute(single_scenario_state)
+
+            # Annotate (post-process the results)
+            single_scenario_state = self.annotator_executor.execute(single_scenario_state)
+
+            # Apply the metrics
+            # When performing a dry run, only estimate the number of tokens instead
+            # of calculating the metrics.
+            metrics: List[MetricInterface] = (
+                [DryRunMetric()]
+                if self.dry_run
+                else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
+            )
+
+            temp_per_instance_stats: List[PerInstanceStats] = []
+            with htrack_block(f"{len(metrics)} metrics"):
+                for metric in metrics:
+                    with htrack_block(metric):
+                        temp_metric_result: MetricResult = metric.evaluate(
+                            single_scenario_state,
+                            self.metric_service,
+                            self.eval_cache_path,
+                            self.executor.execution_spec.parallelism,
+                        )
+                        temp_per_instance_stats.extend(temp_metric_result.per_instance_stats)
+
+            # Update the reeval request states
+            asked_request_states.extend(single_scenario_state.request_states)
+
+            # Update the reeval trajectory
+            reeval_trajectory["model_ability"].append(model_ability)
+            scenario_metric_value = [
+                s for s in temp_per_instance_stats[0].stats if s.name.name == scenario_metric_name
+            ][0].mean
+
+            assert scenario_metric_value is not None
+            reeval_trajectory["response_correctness"].append(scenario_metric_value)
+            assert selected_item.instance.extra_data is not None
+            reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])
+
+            # Estimate the model ability
+            model_ability = self._estimate_model_ability(
+                old_ability=model_ability,
+                response_correctness=reeval_trajectory["response_correctness"],
+                instance_difficulties=reeval_trajectory["instance_difficulties"],
+            )
+
+        # Create the scenario state
+        scenario_state: ScenarioState = ScenarioState(
+            adapter_spec=run_spec.adapter_spec,
+            request_states=asked_request_states,
+            annotator_specs=run_spec.annotators,
+        )
+
+        stats: List[Stat] = []
+        per_instance_stats: List[PerInstanceStats] = []
+        with htrack_block(f"{len(metrics)} metrics"):
+            for metric in metrics:
+                with htrack_block(metric):
+                    metric_result: MetricResult = metric.evaluate(
+                        scenario_state,
+                        self.metric_service,
+                        self.eval_cache_path,
+                        self.executor.execution_spec.parallelism,
+                    )
+                    stats.extend(metric_result.aggregated_stats)
+                    per_instance_stats.extend(metric_result.per_instance_stats)
+
+        # Check that there aren't duplicate `Stat`s
+        # Note: doesn't catch near misses.
+        metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
+        for metric_name, count in metric_counts.items():
+            if count > 1:
+                hwarn(f"duplicate metric name {metric_name}")
+
+        # Print out the number of stats
+        hlog(f"Generated {len(stats)} stats.")
+
+        if self.skip_instances:
+            hlog("skip_instances was True. Skipping writing results out.")
+            return
+
+        # Output benchmarking information and results to files
+        write(os.path.join(run_path, "run_spec.json"), json.dumps(asdict_without_nones(run_spec), indent=2))
+
+        # Write out scenario
+        write(os.path.join(run_path, "scenario.json"), json.dumps(asdict_without_nones(scenario), indent=2))
+
+        # Write scenario state
+        write(os.path.join(run_path, "scenario_state.json"), json.dumps(asdict_without_nones(scenario_state), indent=2))
+
+        write(
+            os.path.join(run_path, "stats.json"),
+            json.dumps([asdict_without_nones(stat) for stat in remove_stats_nans(stats)], indent=2),
+        )
+        write(
+            os.path.join(run_path, "per_instance_stats.json"),
+            json.dumps(list(map(asdict_without_nones, remove_per_instance_stats_nans(per_instance_stats))), indent=2),
+        )
+
+        write(
+            os.path.join(run_path, "reeval_trajectory.json"),
+            json.dumps(reeval_trajectory, indent=2),
+        )
+
+        cache_stats.print_status()
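A note on the estimation machinery above: _estimate_model_ability computes a maximum-likelihood estimate under a one-parameter logistic (Rasch-style) IRT model in which the probability of a correct response is sigmoid(ability + difficulty). With this sign convention the stored difficulty z acts as an additive offset, and the selection loop's abs(difficulty + ability) criterion picks the unasked item whose predicted success probability is closest to 50%, which is where a one-parameter item carries the most information. The following is a minimal, self-contained sketch of that estimate-then-select step under the same model assumptions; the names (estimate_ability, theta) and the toy data are illustrative and not part of the released package:

import torch

def estimate_ability(correctness, offsets, init_ability=0.0, steps=100):
    # MLE of theta for P(correct) = sigmoid(theta + z), mirroring the runner's setup.
    responses = torch.tensor(correctness, dtype=torch.float32)
    z = torch.tensor(offsets, dtype=torch.float32)
    theta = torch.tensor([init_ability], requires_grad=True)
    optim = torch.optim.LBFGS([theta], lr=0.1, max_iter=20, line_search_fn="strong_wolfe")

    def closure():
        optim.zero_grad()
        loss = -torch.distributions.Bernoulli(probs=torch.sigmoid(theta + z)).log_prob(responses).mean()
        loss.backward()
        return loss

    for _ in range(steps):
        optim.step(closure)
    return theta.item()

# Toy trajectory: correct on the two "easy" items (positive z), wrong on the two
# "hard" ones (negative z), so the MLE lands between the bands (~0 here by symmetry).
theta_hat = estimate_ability([1.0, 1.0, 0.0, 0.0], [2.0, 1.0, -1.0, -2.0])

# Next item: the unasked offset closest to -theta_hat, i.e. predicted accuracy ~50%.
candidate_offsets = [3.0, 0.5, -0.2, -3.0]
next_item = min(candidate_offsets, key=lambda zi: abs(zi + theta_hat))
print(round(theta_hat, 2), next_item)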
helm/benchmark/run.py
CHANGED
|
@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
|
|
|
9
9
|
from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
|
|
10
10
|
from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
|
|
11
11
|
from helm.common.general import ensure_directory_exists
|
|
12
|
-
from helm.common.hierarchical_logger import hlog, htrack, htrack_block
|
|
12
|
+
from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
|
|
13
13
|
from helm.common.authentication import Authentication
|
|
14
14
|
from helm.common.object_spec import parse_object_spec, get_class_by_name
|
|
15
15
|
from helm.proxy.services.remote_service import create_authentication, add_service_args
|
|
@@ -200,83 +200,9 @@ def validate_args(args):
 
 
 @htrack(None)
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=[],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--skip-completed-runs",
-        action="store_true",
-        help="Skip RunSpecs that have completed i.e. output files exists.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    parser.add_argument(
-        "--run-specs",
-        nargs="*",
-        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
-        "Specifies run entries to run.",
-        default=[],
-    )
-    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
-    parser.add_argument(
-        "--enable-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
-        "Format: namespace/model_name[@revision]",
-    )
-    parser.add_argument(
-        "--enable-local-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
-    )
-    parser.add_argument(
-        "--runner-class-name",
-        type=str,
-        default=None,
-        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
-    )
-    parser.add_argument(
-        "--openvino",
-        action="store_true",
-        default=False,
-        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
-        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
+def helm_run(args):
 
+    validate_args(args)
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
 
@@ -284,19 +210,13 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
         for huggingface_model_name in args.enable_huggingface_models:
-            if args.openvino:
-                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
-            else:
-                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
         for huggingface_model_path in args.enable_local_huggingface_models:
-            if args.openvino:
-                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
-            else:
-                register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
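The simplified loops pass each `--enable-huggingface-models` value straight through. Per the flag's help text, values use the `namespace/model_name[@revision]` format; a hypothetical sketch of how such a value could be split (the real parsing lives inside `register_huggingface_hub_model_from_flag_value`, and this helper is not part of HELM):

    from typing import Optional, Tuple

    def split_model_flag(value: str) -> Tuple[str, Optional[str]]:
        """Split 'namespace/model_name[@revision]' into (model_id, revision)."""
        if "@" in value:
            model_id, revision = value.split("@", 1)
            return model_id, revision
        return value, None

    # Hypothetical flag values:
    assert split_model_flag("my-org/my-model@main") == ("my-org/my-model", "main")
    assert split_model_flag("my-org/my-model") == ("my-org/my-model", None)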
@@ -323,12 +243,16 @@ def main():
             if model_to_run not in all_models:
                 raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
     else:
-        model_expander_pattern = re.compile(
+        model_expander_wildcard_pattern = re.compile(
             r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
         )
-        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+        if any(model_expander_wildcard_pattern.search(run_entry.description) for run_entry in run_entries):
             raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
 
+        model_expander_pattern = re.compile(r"\bmodel=\b")
+        if not any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander is omitted")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,
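The tightened validation above uses two regexes: the existing one catches wildcard model expanders (`model=all`, `model=text`, ...), and the new one requires that some `model=` expander be present at all when `--models-to-run` is unset. A small self-contained check against a made-up run entry description:

    import re

    # The two patterns from the diff:
    wildcard = re.compile(
        r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
    )
    has_model = re.compile(r"\bmodel=\b")

    description = "mmlu:subject=anatomy,model=all"  # hypothetical run entry description
    assert wildcard.search(description)   # wildcard expander -> --models-to-run required
    assert has_model.search(description)  # a model= expander is present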
@@ -367,13 +291,85 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )
 
     hlog("Done.")
 
 
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = argparse.ArgumentParser()
+    add_service_args(parser)
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
+    parser.add_argument(
+        "--enable-local-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+    )
+    parser.add_argument(
+        "--runner-class-name",
+        type=str,
+        default=None,
+        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+    )
+    add_run_args(parser)
+    args = parser.parse_args()
+    setup_default_logging()
+    return helm_run(args)
+
+
 if __name__ == "__main__":
     main()
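Net effect of the restructuring: argument parsing and `setup_default_logging()` now live in `main()`, while `helm_run(args)` does the actual work, matching the comment in the diff about setting up logging before starting HELM. A self-contained analogue of that control-flow split (all names here are hypothetical stand-ins, not HELM's API):

    import argparse

    def build_parser() -> argparse.ArgumentParser:
        # Hypothetical mini-parser; the real main() registers many more flags.
        parser = argparse.ArgumentParser()
        parser.add_argument("-r", "--run-entries", nargs="*", default=[])
        return parser

    def run(args: argparse.Namespace) -> None:
        # Stand-in for helm_run(args): all the work after logging is configured.
        print(f"would run {len(args.run_entries)} entries")

    def main() -> None:
        args = build_parser().parse_args()
        # setup_default_logging() would be called here, before any work starts
        run(args)

    if __name__ == "__main__":
        main()

One practical consequence of the split is that a wrapper or test can construct an argparse namespace itself and call the run function directly, without going through the CLI.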