crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/static/schema_autobencher.yaml (new file):

@@ -0,0 +1,150 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: annotation_autobencher_capabilities_score
+    display_name: Correct
+    description: Model-judged correctness for AutoBencher Capabilities
+    lower_is_better: false
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: autobencher_scenarios
+    display_name: AutoBencher Scenarios
+    description: AutoBencher Scenarios
+    category: All scenarios
+    subgroups:
+      - autobencher_capabilities
+      - autobencher_safety
+
+  - name: autobencher_capabilities
+    display_name: AutoBencher Capabilities
+    description: AutoBencher Capabilities consists of question answering datasets for math, multilingual, and knowledge-intensive question answering created by AutoBencher. ([paper](https://arxiv.org/abs/2407.08351))
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: annotation_autobencher_capabilities_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: questions about various
+      who: synthetic model-generated questions
+      when: "2024"
+      language: English and various languages
+
+  - name: autobencher_safety
+    display_name: AutoBencher Safety
+    description: AutoBencher Safety consists of harmful question answering datasets created by AutoBencher. ([paper](https://arxiv.org/abs/2407.08351))
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: annotation_autobencher_capabilities_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: harmful questions
+      who: synthetic model-generated questions
+      when: "2024"
+      language: English and various languages
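
The `${main_name}` and `${main_split}` placeholders in the metric_groups above are resolved per run group from its `environment` block when results are summarized (here, `main_name: annotation_autobencher_capabilities_score`). A minimal sketch of that substitution, using plain dicts instead of HELM's schema classes; all names below are illustrative, not part of the package:

```python
from string import Template

# Simplified stand-ins for the YAML above (illustrative only).
metric_group = {
    "name": "accuracy",
    "metrics": [{"name": "${main_name}", "split": "${main_split}"}],
}
environment = {
    "main_name": "annotation_autobencher_capabilities_score",
    "main_split": "test",
}

def resolve(group: dict, env: dict) -> dict:
    """Fill ${...} placeholders in a metric group from a run group's environment."""
    return {
        "name": group["name"],
        "metrics": [
            {key: Template(value).substitute(env) for key, value in metric.items()}
            for metric in group["metrics"]
        ],
    }

print(resolve(metric_group, environment))
# {'name': 'accuracy', 'metrics': [{'name':
#  'annotation_autobencher_capabilities_score', 'split': 'test'}]}
```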
helm/benchmark/static/schema_call_center.yaml:

@@ -99,6 +99,21 @@ metrics:
     short_display_name: Recall
     description: How many key items were recalled
     lower_is_better: false
+
+  - name: annotation_helpdesk_call_center_summarization_score
+    display_name: Score
+    short_display_name: Score
+    description: Score
+    lower_is_better: false
+
+
+  - name: call_summarization_score
+    display_name: Score
+    short_display_name: Score
+    description: Score
+    lower_is_better: false
+
+
 ############################################################
 perturbations: []
 
@@ -108,12 +123,16 @@ metric_groups:
     display_name: Summarization
     hide_win_rates: true
     metrics:
-      - name: annotation_call_center_summarization_faithfulness
-        split: ${main_split}
-      - name: annotation_call_center_summarization_relevance
-        split: ${main_split}
-      - name: annotation_call_center_summarization_coherence
+      - name: call_summarization_score
         split: ${main_split}
+      # - name: annotation_helpdesk_call_center_summarization_score
+      #   split: ${main_split}
+      # - name: annotation_call_center_summarization_faithfulness
+      #   split: ${main_split}
+      # - name: annotation_call_center_summarization_relevance
+      #   split: ${main_split}
+      # - name: annotation_call_center_summarization_coherence
+      #   split: ${main_split}
 
   - name: pairwise_comparison_metrics
     display_name: Pairwise Comparison
@@ -158,31 +177,32 @@ run_groups:
     description: Scenarios representing realistic tasks from the call center.
     category: All scenarios
     subgroups:
-      - call_center_summarization
-      - call_center_summarization_real_call_transcripts
-      - call_center_summarization_pairwise_comparison
-      - call_center_summarization_key_points_recall
+      - helpdesk_call_summarization
+      # - call_center_summarization
+      # - call_center_summarization_real_call_transcripts
+      # - call_center_summarization_pairwise_comparison
+      # - call_center_summarization_key_points_recall
 
-  - name: call_center_summarization
-    display_name: Summarization
-    description: summarization
-    metric_groups:
-      # - accuracy
-      - summarization_metrics
-      - efficiency
-      - general_information
-    environment:
-      main_split: test
-    taxonomy:
-      task: summarization
-      what: n/a
-      who: n/a
-      when: "?"
-      language: English
+  # - name: call_center_summarization
+  #   display_name: Summarization
+  #   description: summarization
+  #   metric_groups:
+  #     # - accuracy
+  #     - summarization_metrics
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_split: test
+  #   taxonomy:
+  #     task: summarization
+  #     what: n/a
+  #     who: n/a
+  #     when: "?"
+  #     language: English
 
-  - name: call_center_summarization_real_call_transcripts
-    display_name: Summarization (Real)
-    description: Summarization with real call transcripts
+  - name: helpdesk_call_summarization
+    display_name: Helpdesk Call summarization
+    description: Helpdesk Call summarization
     metric_groups:
       # - accuracy
       - summarization_metrics
@@ -197,36 +217,53 @@ run_groups:
       when: "?"
       language: English
 
-  - name: call_center_summarization_pairwise_comparison
-    display_name: Summarization (Pairwise)
-    description: summarization
-    metric_groups:
-      # - accuracy
-      - pairwise_comparison_metrics
-      - efficiency
-      - general_information
-    environment:
-      main_split: test
-    taxonomy:
-      task: summarization
-      what: n/a
-      who: n/a
-      when: "?"
-      language: English
+  # - name: call_center_summarization_real_call_transcripts
+  #   display_name: Summarization (Real)
+  #   description: Summarization with real call transcripts
+  #   metric_groups:
+  #     # - accuracy
+  #     - summarization_metrics
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_split: test
+  #   taxonomy:
+  #     task: summarization
+  #     what: n/a
+  #     who: n/a
+  #     when: "?"
+  #     language: English
 
-  - name: call_center_summarization_key_points_recall
-    display_name: Summarization (Key Points Recall)
-    description: summarization
-    metric_groups:
-      # - accuracy
-      - key_points_recall_metrics
-      - efficiency
-      - general_information
-    environment:
-      main_split: test
-    taxonomy:
-      task: summarization
-      what: n/a
-      who: n/a
-      when: "?"
-      language: English
+  # - name: call_center_summarization_pairwise_comparison
+  #   display_name: Summarization (Pairwise)
+  #   description: summarization
+  #   metric_groups:
+  #     # - accuracy
+  #     - pairwise_comparison_metrics
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_split: test
+  #   taxonomy:
+  #     task: summarization
+  #     what: n/a
+  #     who: n/a
+  #     when: "?"
+  #     language: English
+
+  # - name: call_center_summarization_key_points_recall
+  #   display_name: Summarization (Key Points Recall)
+  #   description: summarization
+  #   metric_groups:
+  #     # - accuracy
+  #     - key_points_recall_metrics
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_split: test
+  #   taxonomy:
+  #     task: summarization
+  #     what: n/a
+  #     who: n/a
+  #     when: "?"
+  #     language: English
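
Since this change leaves several summarization metrics commented out while `call_summarization_score` stays both declared and referenced, a small lint that cross-checks the two sections can catch dangling references after edits like this. A sketch with PyYAML, assuming the schema layout shown above (the helper name is hypothetical, not part of crfm-helm):

```python
import yaml

def undeclared_metric_refs(schema_path: str) -> set:
    """Metric names referenced in metric_groups but not declared under metrics.

    Placeholders such as ${main_name} are skipped because they are resolved
    later from each run group's environment.
    """
    with open(schema_path) as f:
        schema = yaml.safe_load(f)
    declared = {m["name"] for m in schema.get("metrics", [])}
    referenced = {
        m["name"]
        for group in schema.get("metric_groups", [])
        for m in group.get("metrics", [])
        if not m["name"].startswith("${")
    }
    return referenced - declared

# Example: undeclared_metric_refs("helm/benchmark/static/schema_call_center.yaml")
```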
helm/benchmark/static/schema_capabilities.yaml (new file):

@@ -0,0 +1,254 @@
+---
+# EXPERIMENTAL: DO NOT USE IN PROD
+# Temporary schema for prototyping HELM Lite v2
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+  - name: ifeval_strict_accuracy
+    display_name: IFEval strict accuracy
+    short_display_name: IFEval Strict Acc
+    description: Fraction of instructions in the instance that are correctly followed.
+    lower_is_better: false
+  - name: chain_of_thought_correctness
+    display_name: COT correct
+    short_display_name: COT correct
+    description: Fraction of correct answers after chain of thought
+    lower_is_better: false
+  - name: wildbench_score
+    display_name: WildBench Score
+    short_display_name: WB Score
+    description: Score of the AI output judged by GPT-4o.
+    lower_is_better: false
+  - name: wildbench_score_rescaled
+    display_name: WildBench Score
+    short_display_name: WB Score
+    description: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.
+    lower_is_better: false
+  - name: omni_math_accuracy
+    display_name: Omni-MATH Accuracy
+    short_display_name: Acc
+    description: Accuracy of the AI output judged by GPT-4.
+    lower_is_better: false
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: Core Scenarios
+    description: Core Scenarios
+    category: All scenarios
+    subgroups:
+      - mmlu_pro
+      - gpqa
+      - ifeval
+      - wildbench
+      - omni_math
+
+  - name: mmlu_pro
+    display_name: MMLU-Pro
+    description: MMLU-Pro
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: chain_of_thought_correctness
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "complex questions across various disciplines"
+      who: "domain experts"
+      when: "2024"
+      language: English
+
+  - name: gpqa
+    display_name: GPQA
+    description: GPQA
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: chain_of_thought_correctness
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "graduate-level questions in biology, physics, and chemistry"
+      who: "domain experts"
+      when: "2023"
+      language: English
+
+  - name: ifeval
+    display_name: IFEval
+    description: IFEval
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: ifeval_strict_accuracy
+      main_split: test
+    taxonomy:
+      task: "instruction following"
+      what: "verifiable general domain instruction following"
+      who: "human annotators"
+      when: "2023"
+      language: English
+
+  - name: wildbench
+    display_name: WildBench
+    description: WildBench
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: wildbench_score_rescaled
+      main_split: test
+    taxonomy:
+      task: "instruction following"
+      what: "GPT-judged instruction following with instructions collected from real-user conversations"
+      who: "real-world users"
+      when: "2024"
+      language: English
+
+  - name: omni_math
+    display_name: Omni-MATH
+    description: Omni-MATH
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: omni_math_accuracy
+      main_split: test
+    taxonomy:
+      task: "mathematics"
+      what: "universal Olympiad level mathematic benchmark"
+      who: "human annotators"
+      when: "2024"
+      language: English
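
The ifeval_strict_accuracy metric defined above is the fraction of an instance's verifiable instructions that the model's response satisfies. A toy sketch of that per-instance aggregation, assuming each instruction exposes a boolean predicate over the response text (HELM's actual checkers live under helm/benchmark/metrics/ifeval/ and are applied in ifeval_metrics.py):

```python
from typing import Callable, List

def strict_accuracy(response: str, checks: List[Callable[[str], bool]]) -> float:
    """Fraction of instruction checks that the response passes (0.0 if none)."""
    if not checks:
        return 0.0
    return sum(check(response) for check in checks) / len(checks)

# Two toy instruction checks (hypothetical, for illustration only):
checks = [
    lambda r: len(r.split()) >= 5,      # "respond with at least five words"
    lambda r: r.strip().endswith("."),  # "end your answer with a period"
]
print(strict_accuracy("This is a complete sentence.", checks))  # 1.0
```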