crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/static/schema_safety.yaml

@@ -76,7 +76,7 @@ metrics:
     lower_is_better: false
   - name: safety_score
     display_name: LM Evaluated Safety score
-    description: LM Evaluated Safety score (mean of
+    description: LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).
     lower_is_better: false
   - name: bbq_accuracy
     display_name: BBQ accuracy

@@ -98,6 +98,13 @@ metrics:
     lower_is_better: true
     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
 
+  # Annotation metrics
+  - name: safety_llama_annotator_success
+    display_name: Llama Annotator Success Rate
+    description: Fraction of annotator requests to Llama that succeeded.
+  - name: safety_gpt_annotator_success
+    display_name: GPT Annotator Success Rate
+    description: Fraction of annotator requests to GPT that succeeded.
 
 ############################################################
 perturbations: []

@@ -106,6 +113,8 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
+    aggregation_strategies:
+      - mean
     metrics:
       - name: ${main_name}
         split: ${main_split}

@@ -131,6 +140,15 @@ metric_groups:
       - name: num_output_tokens
         split: ${main_split}
 
+  - name: annotation_metrics
+    display_name: Annotation
+    hide_win_rates: true
+    metrics:
+      - name: safety_llama_annotator_success
+        split: ${main_split}
+      - name: safety_gpt_annotator_success
+        split: ${main_split}
+
   - name: toxicity
     display_name: Toxicity
     metrics:

@@ -153,6 +171,7 @@ run_groups:
     description: Scenarios for the model safety
     category: All scenarios
     subgroups:
+      # - harm_bench_gcg_transfer
       - harm_bench
       - simple_safety_tests
      - bbq

@@ -164,8 +183,25 @@ run_groups:
     description: HarmBench
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
+      - annotation_metrics
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: harm_bench_gcg_transfer
+    display_name: HarmBenchGCGTransfer
+    description: HarmBenchGCGTransfer
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test

@@ -181,8 +217,8 @@ run_groups:
     description: SimpleSafetyTests
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test

@@ -198,8 +234,8 @@ run_groups:
     description: XSTest
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test

@@ -215,9 +251,9 @@ run_groups:
     description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
       - bbq_metrics
+      - annotation_metrics
     environment:
       main_name: bbq_accuracy
       main_split: test

@@ -234,8 +270,8 @@ run_groups:
     description: Anthropic Red Team
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test
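The hunks above are plain YAML, so the structure they modify is easy to inspect directly: `metric_groups` define named bundles of metrics, and the `${main_name}` and `${main_split}` placeholders in those bundles are filled in from each run group's `environment`. Below is a minimal sketch of that resolution, assuming PyYAML is installed and a local copy of the schema file is available; it mirrors the keys visible in the hunks, not HELM's own schema loader.

```python
# Minimal sketch: walk a HELM-style schema YAML and show, for each run group,
# its metric groups with ${main_name}/${main_split} resolved from the group's
# environment. Assumes PyYAML and a local copy of the schema file.
import yaml

with open("helm/benchmark/static/schema_safety.yaml") as f:
    schema = yaml.safe_load(f)

# Index metric groups by name so run groups can reference them.
metric_groups = {g["name"]: g for g in schema.get("metric_groups", [])}

for run_group in schema.get("run_groups", []):
    # e.g. {"main_name": "safety_score", "main_split": "test"}
    env = run_group.get("environment", {})
    for group_name in run_group.get("metric_groups", []):
        group = metric_groups.get(group_name, {})
        for metric in group.get("metrics", []):
            # Substitute the ${main_name} / ${main_split} placeholders.
            name = metric["name"].replace("${main_name}", env.get("main_name", ""))
            split = str(metric.get("split", "")).replace("${main_split}", env.get("main_split", ""))
            print(f"{run_group['name']} / {group_name}: {name} [{split}]")
```

For the `harm_bench` run group above, for instance, this would resolve the `accuracy` group's `${main_name}` to `safety_score` on the `test` split, alongside the general-information and annotation metrics.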
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml}

@@ -69,15 +69,25 @@ metrics:
   - name: chr_f_plus_plus
     display_name: ChrF++
     description: Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, 2015)](https://aclanthology.org/W15-3049/). Code can be found [here](https://github.com/mjpost/sacrebleu).
+    lower_is_better: false
   - name: squad_exact_match_score
     display_name: SQuAD exact match
     description: SQuAD exact match score [(Rajpurkar, 2016)](https://aclanthology.org/D16-1264). Code can be found [here](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+    lower_is_better: false
   - name: squad_f1_score
     display_name: SQuAD macro-averaged F1 score
     description: SQuAD macro-averaged F1 score [(Rajpurkar, 2016)](https://aclanthology.org/D16-1264). Code can be found [here](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+    lower_is_better: false
   - name: classification_macro_f1
     display_name: Macro F1 score
     description: Macro F1 score
+    lower_is_better: false
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
 
 ############################################################
 perturbations: []

@@ -86,12 +96,16 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
+    aggregation_strategies:
+      - mean
     metrics:
       - name: ${main_name}
         split: ${main_split}
 
   - name: efficiency
     display_name: Efficiency
+    aggregation_strategies:
+      - mean
     metrics:
       - name: inference_runtime
         split: ${main_split}

@@ -113,10 +127,10 @@ metric_groups:
 ############################################################
 
 run_groups:
-  - name:
-    display_name:
-    description:
-    category:
+  - name: seahelm_nlu
+    display_name: SEA-HELM natural language understanding (NLU)
+    description: SEA-HELM natural language understanding (NLU) scenarios
+    category: SEA-HELM scenarios
     subgroups:
       - tydiqa
       - xquad_vi

@@ -130,10 +144,10 @@ run_groups:
       - vihsd
       - thaitoxicitytweets
 
-  - name:
-    display_name:
-    description:
-    category:
+  - name: seahelm_nlg
+    display_name: SEA-HELM natural language generation (NLG)
+    description: SEA-HELM natural language generation (NLG) scenarios
+    category: SEA-HELM scenarios
     subgroups:
       - flores_id_en
       - flores_vi_en

@@ -144,10 +158,10 @@ run_groups:
       - flores_en_th
       - flores_en_ta
 
-  - name:
-    display_name:
-    description:
-    category:
+  - name: seahelm_nlr
+    display_name: SEA-HELM natural language reasoning (NLR)
+    description: SEA-HELM natural language reasoning (NLR) scenarios
+    category: SEA-HELM scenarios
     subgroups:
       - indonli
       - xnli_vi

@@ -158,14 +172,14 @@ run_groups:
       - xcopa_th
       - xcopa_ta
 
-  - name:
-    display_name:
-    description:
-    category:
+  - name: seahelm_lindsea
+    display_name: SEA-HELM Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
+    description: SEA-HELM Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
+    category: SEA-HELM scenarios
     subgroups:
       - lindsea_syntax_minimal_pairs_id
-      -
-      -
+      - lindsea_pragmatics_presuppositions_id
+      - lindsea_pragmatics_scalar_implicatures_id
 
   - name: tydiqa
     display_name: TyDiQA

@@ -672,10 +686,10 @@ run_groups:
       when: "?"
       language: Indonesian
 
-  - name:
-    display_name: LINDSEA Pragmatics
+  - name: lindsea_pragmatics_presuppositions_id
+    display_name: LINDSEA Pragmatics Presuppositions
     description: >
-      LINDSEA
+      LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving two formats: single and pair sentences. For single sentence questions, the system under test needs to determine if the sentence is true/false. For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn from another sentence.
     metric_groups:
       - accuracy
       - efficiency

@@ -685,15 +699,15 @@ run_groups:
       main_split: test
     taxonomy:
       task: pragmatic reasoning
-      what:
+      what: presuppositions
       who: "?"
       when: "?"
       language: Indonesian
 
-  - name:
-    display_name: LINDSEA Pragmatics
+  - name: lindsea_pragmatics_scalar_implicatures_id
+    display_name: LINDSEA Pragmatics Scalar Implicatures
     description: >
-      LINDSEA
+      LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), , involving two formats: single and pair sentences. For single sentence questions, the system under test needs to determine if the sentence is true/false. For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn from another sentence.
     metric_groups:
       - accuracy
       - efficiency

@@ -703,7 +717,7 @@ run_groups:
       main_split: test
     taxonomy:
       task: pragmatic reasoning
-      what: scalar implicatures
+      what: scalar implicatures
       who: "?"
       when: "?"
       language: Indonesian
helm/benchmark/static/schema_social_audio.yaml (new file)

@@ -0,0 +1,224 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: bleu_1
+    display_name: BLEU-1
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: bleu_4
+    display_name: BLEU-4
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: cider
+    display_name: CIDEr
+    description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
+    lower_is_better: false
+
+  # Machine Translation metrics
+  - name: bleu
+    display_name: BLEU
+    short_display_name: BLEU
+    description: BLEU score based on [Post, (2018)](https://aclanthology.org/W18-6319/).
+    lower_is_better: false
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+
+run_groups:
+  - name: audio_scenarios
+    display_name: Audio Scenarios
+    description: Audio Scenarios
+    category: All scenarios
+    subgroups:
+      - audio_mnist
+      - meld_audio
+      - iemocap_audio
+
+  - name: audio_mnist
+    display_name: AudioMNIST
+    description: >
+      The AudioMNIST dataset consists of a dataset of 30000 audio samples of
+      spoken digits (0-9) of 60 different speakers. The task is to classify the digit from the
+      audio sample ([Becker et al, 2023](https://arxiv.org/abs/1807.03418)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: audio classification
+      what: audio samples of spoken digits (0-9)
+      who: 60 different speakers
+      when: "2018"
+      language: English
+
+  - name: meld_audio
+    display_name: MELD Audio
+    description: Classify emotions in audio clips from the television series Friends ([Poria et al, 2018](https://arxiv.org/abs/1810.02508)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: audio classification
+      what: audio clips from the television series Friends
+      who: television actors
+      when: "1994-2004"
+      language: English
+
+  - name: iemocap_audio
+    display_name: IEMOCAP Audio
+    description: >
+      A classification scenario based on audio data from the Interactive emotional dyadic motion capture database (IEMOCAP) ([Busso et al, 2008](https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf)). The task is to classify the emotion of the speaker(s) in the audio sample.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: audio classification
+      what: audio recordings improvisations or scripted scenarios
+      who: 10 professional actors
+      when: "2007"
+      language: English
@@ -0,0 +1,171 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end-of-text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: program_accuracy
+    display_name: Program Accuracy
+    description: Accuracy of the generated programs.
+    lower_is_better: false
+  - name: execution_accuracy
+    display_name: Execution Accuracy
+    description: Accuracy of the final result of the generated program.
+    lower_is_better: false
+  - name: annotation_financebench_label_correct_answer
+    display_name: Correct Answer
+    description: Whether the final result was correct, as judged by GPT-4o.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances where the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
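The "light processing" behind quasi-exact match can be sketched as SQuAD-style answer normalization (lowercasing, dropping punctuation and articles, collapsing whitespace); HELM's exact normalization rules may differ, so treat this as illustrative:

```python
# Sketch of quasi-exact match with SQuAD-style normalization.
import re
import string


def normalize(text: str) -> str:
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)  # drop English articles
    return " ".join(text.split())  # collapse whitespace


def quasi_exact_match(prediction: str, reference: str) -> float:
    return 1.0 if normalize(prediction) == normalize(reference) else 0.0


print(quasi_exact_match(" The answer is: Paris. ", "the answer is paris"))  # 1.0
```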
+  - name: error_rate
+    display_name: SQL Error Rate
+    short_display_name: SQL Error Rate
+    description: Fraction of generated queries that result in a SQL execution error.
+    lower_is_better: true
+  - name: execution_accuracy
+    display_name: Execution Accuracy
+    short_display_name: Execution Accuracy
+    description: Fraction of generated queries whose execution result matches that of the gold query.
+    lower_is_better: false
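Both SQL metrics above are execution-based: run the gold and predicted queries against the same database and compare outcomes. A minimal sqlite3 sketch follows; the database path and the order-insensitive row-set comparison are assumptions, and in HELM the annotators handle the actual execution for these scenarios:

```python
# Sketch of execution-based text-to-SQL scoring against a sqlite database.
import sqlite3


def execute(db_path: str, query: str):
    with sqlite3.connect(db_path) as conn:
        try:
            rows = conn.execute(query).fetchall()
            return set(rows), None  # set() makes the comparison order-insensitive
        except sqlite3.Error as err:
            return None, err


def score(db_path: str, predicted_sql: str, gold_sql: str) -> dict:
    pred_rows, pred_err = execute(db_path, predicted_sql)
    gold_rows, _ = execute(db_path, gold_sql)
    return {
        "error_rate": 1.0 if pred_err is not None else 0.0,
        "execution_accuracy": 1.0 if pred_err is None and pred_rows == gold_rows else 0.0,
    }
```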
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: text_to_sql_scenarios
+    display_name: Text-to-SQL Scenarios
+    description: Text-to-SQL Scenarios
+    category: All scenarios
+    subgroups:
+      - spider
+      - bird_sql
+
+  - name: spider
+    display_name: Spider 1.0 (Test)
+    description: Spider 1.0 (Test)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: execution_accuracy
+      main_split: valid
+    taxonomy:
+      task: text-to-SQL
+      what: databases from various domains
+      who: expert data scientists
+      when: "?"
+      language: English
+
+  - name: bird_sql
+    display_name: BIRD-SQL (Dev)
+    description: BIRD-SQL (Dev)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: execution_accuracy
+      main_split: valid
+    taxonomy:
+      task: text-to-SQL
+      what: databases from various domains
+      who: expert data scientists
+      when: "?"
+      language: English