crfm-helm 0.4.0 → 0.5.10 (py3-none-any.whl)
This diff shows the changes between two publicly released versions of the crfm-helm package, as they appear in their public registry. It is provided for informational purposes only.
This release has been flagged as potentially problematic.
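A file-level listing like the one below can be reproduced locally by downloading both wheels and diffing their contents. Here is a minimal Python sketch, assuming both wheels have been fetched into a local `wheels/` directory; it is illustrative only, not the registry's own diff tooling, and unlike the listing below it reports renames (the `{old → new}` entries) as a removal plus an addition:

```python
# Fetch the two releases first, e.g.:
#   pip download crfm-helm==0.4.0 --no-deps -d wheels/
#   pip download crfm-helm==0.5.10 --no-deps -d wheels/
import difflib
import zipfile


def wheel_files(path: str) -> dict:
    """Map each archive member to its text lines, or None for binary members."""
    out = {}
    with zipfile.ZipFile(path) as wheel:
        for name in wheel.namelist():
            data = wheel.read(name)
            try:
                out[name] = data.decode("utf-8").splitlines()
            except UnicodeDecodeError:
                out[name] = None  # binary member (.png, .sp, ...): listed but not diffed
    return out


def diff_wheels(old_path: str, new_path: str) -> None:
    old, new = wheel_files(old_path), wheel_files(new_path)
    for name in sorted(set(old) | set(new)):
        before, after = old.get(name) or [], new.get(name) or []
        added = removed = 0
        for line in difflib.unified_diff(before, after, lineterm=""):
            # Skip the "---"/"+++" file headers; count only changed lines.
            if line.startswith("+") and not line.startswith("+++"):
                added += 1
            elif line.startswith("-") and not line.startswith("---"):
                removed += 1
        if added or removed or (name in old) != (name in new):
            print(f"{name} +{added} -{removed}")


diff_wheels(
    "wheels/crfm_helm-0.4.0-py3-none-any.whl",
    "wheels/crfm_helm-0.5.10-py3-none-any.whl",
)
```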
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,1257 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end-of-text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
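The quasi-* metrics above all hinge on the same "light processing" step. Below is a minimal sketch of that normalization, assuming the usual steps (lowercasing, stripping punctuation and English articles, collapsing whitespace) rather than the package's exact rules:

```python
import re
import string

def normalize(text: str) -> str:
    """Light processing assumed here: lowercase, drop punctuation and
    English articles, collapse whitespace."""
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def quasi_exact_match(prediction: str, reference: str) -> bool:
    return normalize(prediction) == normalize(reference)

def quasi_prefix_exact_match(prediction: str, reference: str) -> bool:
    # One common convention: the normalized reference must be a prefix of the
    # normalized prediction; the packaged metric fixes the exact direction.
    return normalize(prediction).startswith(normalize(reference))

assert quasi_exact_match("The Answer.", "answer")
```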
+  - name: exact_match@5
+    display_name: Exact match @5
+    short_display_name: EM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match@5
+    display_name: Quasi-exact match @5
+    short_display_name: EM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match@5
+    display_name: Prefix exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match@5
+    display_name: Prefix quasi-exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: logprob
+    display_name: Log probability
+    short_display_name: Logprob
+    description: Predicted output's average log probability (input's log prob for language modeling).
+    lower_is_better: false
+  - name: logprob_per_byte
+    display_name: Log probability / byte
+    short_display_name: Logprob/byte
+    description: Predicted output's average log probability normalized by the number of bytes.
+    lower_is_better: false
+  - name: bits_per_byte
+    display_name: Bits/byte
+    short_display_name: BPB
+    lower_is_better: true
+    description: Average number of bits per byte according to model probabilities.
+  - name: perplexity
+    display_name: Perplexity
+    short_display_name: PPL
+    lower_is_better: true
+    description: Perplexity of the output completion (effective branching factor per output token).
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: bleu_1
+    display_name: BLEU-1
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: bleu_4
+    display_name: BLEU-4
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+    lower_is_better: false
+  - name: f1_set_match
+    display_name: F1 (set match)
+    short_display_name: F1
+    description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: f1_score@5
+    display_name: F1@5
+    description: Average F1 score at top 5 in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+  - name: absolute_value_difference
+    display_name: Absolute difference
+    short_display_name: Diff.
+    lower_is_better: true
+    description: Average absolute difference between the model output (converted to a number) and the correct reference.
+  - name: distance
+    display_name: Geometric distance
+    short_display_name: Dist.
+    lower_is_better: true
+    description: Average geometric distance between the model output (as a point) and the correct reference (as a curve).
+  - name: percent_valid
+    display_name: Valid fraction
+    short_display_name: Valid
+    description: Fraction of valid model outputs (as a number).
+    lower_is_better: false
+  - name: RR@5
+    display_name: RR@5
+    description: Mean reciprocal rank at 5 in information retrieval.
+    lower_is_better: false
+  - name: NDCG@10
+    display_name: NDCG@10
+    description: Normalized discounted cumulative gain at 10 in information retrieval.
+    lower_is_better: false
+  - name: RR@10
+    display_name: RR@10
+    description: Mean reciprocal rank at 10 in information retrieval.
+    lower_is_better: false
+  - name: NDCG@20
+    display_name: NDCG@20
+    description: Normalized discounted cumulative gain at 20 in information retrieval.
+    lower_is_better: false
+  - name: RR@20
+    display_name: RR@20
+    description: Mean reciprocal rank at 20 in information retrieval.
+    lower_is_better: false
+  - name: Success@1
+    display_name: Success@1
+    description: Success at top 1 in information retrieval.
+    lower_is_better: false
+  - name: Success@2
+    display_name: Success@2
+    description: Success at top 2 in information retrieval.
+    lower_is_better: false
+  - name: Success@3
+    display_name: Success@3
+    description: Success at top 3 in information retrieval.
+    lower_is_better: false
+  - name: Success@5
+    display_name: Success@5
+    description: Success at top 5 in information retrieval.
+    lower_is_better: false
+  - name: Success@10
+    display_name: Success@10
+    description: Success at top 10 in information retrieval.
+    lower_is_better: false
+  - name: Success@20
+    display_name: Success@20
+    description: Success at top 20 in information retrieval.
+    lower_is_better: false
+  - name: Recall@1
+    display_name: Recall@1
+    description: Recall at top 1 in information retrieval.
+    lower_is_better: false
+  - name: Recall@2
+    display_name: Recall@2
+    description: Recall at top 2 in information retrieval.
+    lower_is_better: false
+  - name: Recall@3
+    display_name: Recall@3
+    description: Recall at top 3 in information retrieval.
+    lower_is_better: false
+  - name: Recall@5
+    display_name: Recall@5
+    description: Recall at top 5 in information retrieval.
+    lower_is_better: false
+  - name: Recall@10
+    display_name: Recall@10
+    description: Recall at top 10 in information retrieval.
+    lower_is_better: false
+  - name: Recall@20
+    display_name: Recall@20
+    description: Recall at top 20 in information retrieval.
+    lower_is_better: false
+  - name: Success@1 (topk=30)
+    display_name: Success@1 (topk=30)
+    description: Success at top 1 in information retrieval.
+    lower_is_better: false
+  - name: Success@2 (topk=30)
+    display_name: Success@2 (topk=30)
+    description: Success at top 2 in information retrieval.
+    lower_is_better: false
+  - name: Success@3 (topk=30)
+    display_name: Success@3 (topk=30)
+    description: Success at top 3 in information retrieval.
+    lower_is_better: false
+  - name: Success@5 (topk=30)
+    display_name: Success@5 (topk=30)
+    description: Success at top 5 in information retrieval.
+    lower_is_better: false
+  - name: Success@10 (topk=30)
+    display_name: Success@10 (topk=30)
+    description: Success at top 10 in information retrieval.
+    lower_is_better: false
+  - name: Success@20 (topk=30)
+    display_name: Success@20 (topk=30)
+    description: Success at top 20 in information retrieval.
+    lower_is_better: false
+  - name: Recall@1 (topk=30)
+    display_name: Recall@1 (topk=30)
+    description: Recall at top 1 in information retrieval.
+    lower_is_better: false
+  - name: Recall@2 (topk=30)
+    display_name: Recall@2 (topk=30)
+    description: Recall at top 2 in information retrieval.
+    lower_is_better: false
+  - name: Recall@3 (topk=30)
+    display_name: Recall@3 (topk=30)
+    description: Recall at top 3 in information retrieval.
+    lower_is_better: false
+  - name: Recall@5 (topk=30)
+    display_name: Recall@5 (topk=30)
+    description: Recall at top 5 in information retrieval.
+    lower_is_better: false
+  - name: Recall@10 (topk=30)
+    display_name: Recall@10 (topk=30)
+    description: Recall at top 10 in information retrieval.
+    lower_is_better: false
+  - name: Recall@20 (topk=30)
+    display_name: Recall@20 (topk=30)
+    description: Recall at top 20 in information retrieval.
+    lower_is_better: false
+  - name: RR@5 (topk=30)
+    display_name: RR@5 (topk=30)
+    description: Mean reciprocal rank at 5 in information retrieval.
+    lower_is_better: false
+  - name: RR@10 (topk=30)
+    display_name: RR@10 (topk=30)
+    description: Mean reciprocal rank at 10 in information retrieval.
+    lower_is_better: false
+  - name: RR@20 (topk=30)
+    display_name: RR@20 (topk=30)
+    description: Mean reciprocal rank at 20 in information retrieval.
+    lower_is_better: false
+  - name: math_equiv
+    display_name: Equivalent
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference.
+    lower_is_better: false
+  - name: math_equiv_chain_of_thought
+    display_name: Equivalent (chain of thought)
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
+    lower_is_better: false
+  - name: exact_match_indicator
+    display_name: Exact match (final)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
+    lower_is_better: false
+  - name: final_number_exact_match
+    display_name: Exact match (final number)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
+    lower_is_better: false
+  - name: exact_set_match
+    display_name: Exact match (at sets)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
+    lower_is_better: false
+  - name: iou_set_match
+    display_name: Intersection over union (as sets)
+    short_display_name: IoU
+    description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
+    lower_is_better: false
+
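The log-probability metrics defined earlier in this section (logprob, logprob_per_byte, bits_per_byte, perplexity) are all transformations of one summed log probability. A sketch of the conversions implied by those definitions; the function and argument names are illustrative, not part of the package:

```python
import math

def derived_lm_metrics(total_logprob: float, num_tokens: int, num_bytes: int) -> dict:
    """Derive the schema's LM metrics from a summed natural-log probability."""
    return {
        "logprob": total_logprob,
        "logprob_per_byte": total_logprob / num_bytes,
        # bits/byte: negate, convert nats to bits, normalize by byte count.
        "bits_per_byte": -total_logprob / (num_bytes * math.log(2)),
        # perplexity: effective branching factor per output token.
        "perplexity": math.exp(-total_logprob / num_tokens),
    }

# 100 tokens, 400 bytes, logprob -138.6 => perplexity ~4.0, bits/byte ~0.5.
print(derived_lm_metrics(-138.6, num_tokens=100, num_bytes=400))
```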
+  # Summarization metrics
+  - name: summac
+    display_name: SummaC
+    description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+    lower_is_better: false
+  - name: QAFactEval
+    display_name: QAFactEval
+    description: Faithfulness scores based on the QAFactEval method of Fabbri et al. (2022).
+    lower_is_better: false
+  - name: summarization_coverage
+    display_name: Coverage
+    description: Extent to which the model-generated summaries are extractive fragments from the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+  - name: summarization_density
+    display_name: Density
+    description: Extent to which the model-generated summaries are extractive summaries based on the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+  - name: summarization_compression
+    display_name: Compression
+    description: Extent to which the model-generated summaries are compressed relative to the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+  - name: BERTScore-P
+    display_name: BERTScore (P)
+    description: Average BERTScore precision [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+    lower_is_better: false
+  - name: BERTScore-R
+    display_name: BERTScore (R)
+    description: Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+    lower_is_better: false
+  - name: BERTScore-F
+    display_name: BERTScore (F1)
+    description: Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+    lower_is_better: false
+  - name: HumanEval-faithfulness
+    display_name: HumanEval-faithfulness
+    description: Human evaluation score for faithfulness.
+    lower_is_better: false
+  - name: HumanEval-relevance
+    display_name: HumanEval-relevance
+    description: Human evaluation score for relevance.
+    lower_is_better: false
+  - name: HumanEval-coherence
+    display_name: HumanEval-coherence
+    description: Human evaluation score for coherence.
+    lower_is_better: false
+
+  # Bias metrics:
+  - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
+    display_name: Stereotypical associations (race, profession)
+    short_display_name: Stereotypes (race)
+    lower_is_better: true
+    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on co-occurrence statistics between the racially associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+  - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
+    display_name: Stereotypical associations (race, adjectives)
+    short_display_name: Stereotypes (race)
+    lower_is_better: true
+    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on co-occurrence statistics between the racially associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
+    display_name: Stereotypical associations (gender, profession)
+    short_display_name: Stereotypes (gender)
+    lower_is_better: true
+    description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+  - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
+    display_name: Stereotypical associations (gender, adjectives)
+    short_display_name: Stereotypes (gender)
+    lower_is_better: true
+    description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bias_metric:mode=representation,demographic_category=race
+    display_name: Demographic representation (race)
+    short_display_name: Representation (race)
+    lower_is_better: true
+    description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
+  - name: bias_metric:mode=representation,demographic_category=gender
+    display_name: Demographic representation (gender)
+    short_display_name: Representation (gender)
+    lower_is_better: true
+    description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bbq_metric_ambiguous_bias
+    display_name: BBQ (ambiguous)
+    lower_is_better: true
+    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
+  - name: bbq_metric_unambiguous_bias
+    display_name: BBQ (unambiguous)
+    lower_is_better: true
+    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+
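The representation metrics above reduce to counting group-associated words and measuring how far the resulting distribution is from uniform. A toy sketch of that idea, using stand-in word lists (HELM draws its lists from Garg et al. (2018) and Bolukbasi et al. (2016)) and total variation distance from the uniform distribution as the unevenness score:

```python
from collections import Counter
from typing import Dict, List

# Hypothetical word lists for illustration only; the package ships its own.
GROUP_WORDS: Dict[str, List[str]] = {
    "male": ["he", "him", "his"],
    "female": ["she", "her", "hers"],
}

def demographic_representation(completions: List[str]) -> float:
    """Toy representation score: total variation distance between the observed
    distribution over demographic groups and the uniform distribution
    (0 = perfectly even, higher = more skewed)."""
    counts: Counter = Counter()
    for text in completions:
        tokens = text.lower().split()
        for group, words in GROUP_WORDS.items():
            counts[group] += sum(tokens.count(w) for w in words)
    total = sum(counts.values())
    if total == 0:
        return 0.0
    uniform = 1.0 / len(GROUP_WORDS)
    return sum(abs(counts[g] / total - uniform) for g in GROUP_WORDS) / 2

print(demographic_representation(["He said his plan worked.", "She agreed."]))
```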
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: Max toxicity
+    lower_is_better: true
+    description: Fraction of instances where the most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+  # Efficiency metrics
+  - name: training_co2_cost
+    display_name: Estimated training emissions (kg CO2)
+    short_display_name: Training emissions (kg CO2)
+    lower_is_better: true
+    description: Estimate of the CO2 emissions from training the model.
+  - name: training_energy_cost
+    display_name: Estimated training energy cost (MWh)
+    short_display_name: Training energy (MWh)
+    lower_is_better: true
+    description: Estimate of the amount of energy used to train the model.
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on the particular deployment).
+  - name: inference_idealized_runtime
+    display_name: Idealized inference runtime (s)
+    short_display_name: Idealized inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+  - name: inference_denoised_runtime
+    display_name: Denoised inference runtime (s)
+    short_display_name: Denoised inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model, with performance contention removed by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+  - name: batch_size
+    display_name: Batch size
+    description: For batch jobs, how many requests are in a batch.
+
+  # Calibration metrics:
+  - name: max_prob
+    display_name: Max prob
+    description: Model's average confidence in its prediction (only computed for classification tasks).
+    lower_is_better: false
+  - name: ece_10_bin
+    display_name: 10-bin expected calibration error
+    short_display_name: ECE (10-bin)
+    lower_is_better: true
+    description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
+  - name: ece_1_bin
+    display_name: 1-bin expected calibration error
+    short_display_name: ECE (1-bin)
+    lower_is_better: true
+    description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
+  - name: selective_cov_acc_area
+    display_name: Selective coverage-accuracy area
+    short_display_name: Selective Acc
+    description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
+    lower_is_better: false
+  - name: selective_acc@10
+    display_name: Accuracy at 10% coverage
+    short_display_name: Acc@10%
+    description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
+    lower_is_better: false
+  - name: platt_ece_10_bin
+    display_name: 10-bin expected calibration error (after Platt scaling)
+    short_display_name: Platt-scaled ECE (10-bin)
+    lower_is_better: true
+    description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+  - name: platt_ece_1_bin
+    display_name: 1-bin expected calibration error (after Platt scaling)
+    short_display_name: Platt-scaled ECE (1-bin)
+    lower_is_better: true
+    description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+  - name: platt_coef
+    display_name: Platt scaling coefficient
+    short_display_name: Platt Coef
+    description: Coefficient of the Platt scaling classifier (can compare this across tasks).
+    lower_is_better: false
+  - name: platt_intercept
+    display_name: Platt scaling intercept
+    short_display_name: Platt Intercept
+    description: Intercept of the Platt scaling classifier (can compare this across tasks).
+    lower_is_better: false
+
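The ece_10_bin entry above describes an equal-mass binning: sort predictions by confidence, split them into bins of equal size, and average |accuracy - confidence| over bins. A sketch of that estimator, illustrative rather than the package's exact implementation:

```python
import numpy as np

def expected_calibration_error(confidences, correct, num_bins: int = 10) -> float:
    """Equal-mass ECE sketch: sort by confidence, split into num_bins bins of
    (roughly) equal size, and average |accuracy - mean confidence| per bin,
    weighted by bin size."""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)
    order = np.argsort(confidences)
    bins = np.array_split(order, num_bins)
    n = len(confidences)
    ece = 0.0
    for idx in bins:
        if len(idx) == 0:
            continue
        ece += (len(idx) / n) * abs(correct[idx].mean() - confidences[idx].mean())
    return ece

# With num_bins=1 this reduces to ece_1_bin: |mean accuracy - mean confidence|.
print(expected_calibration_error([0.9, 0.8, 0.6, 0.55], [1, 1, 0, 1], num_bins=2))
```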
+############################################################
+perturbations:
+  - name: robustness
+    display_name: Robustness
+    description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
+  - name: fairness
+    display_name: Fairness
+    description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
+  - name: typos
+    display_name: Typos
+    description: >
+      Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
+      performance between perturbed and unperturbed versions.
+  - name: synonym
+    display_name: Synonyms
+    description: >
+      Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
+      worst-case performance between perturbed and unperturbed versions.
+  - name: dialect
+    display_name: SAE -> AAE
+    short_display_name: Dialect
+    description: >
+      Deterministically substitutes SAE words in the input with AAE counterparts using the validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+  - name: race
+    display_name: First names by race (White -> Black)
+    short_display_name: Race
+    description: >
+      Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+  - name: gender
+    display_name: Pronouns by gender (Male -> Female)
+    short_display_name: Gender
+    description: >
+      Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
+      performance between perturbed and unperturbed versions.
+
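Each perturbation above is scored by pairing the perturbed instance with its unperturbed original and keeping the per-instance worst case. A toy sketch of the typos row under that scheme; the packaged perturbation has its own typo model, and `score_fn` is a hypothetical callable from input text to a metric value:

```python
import random

def add_typos(text: str, prob: float = 0.05, seed: int = 0) -> str:
    """Toy typo perturbation: with probability `prob`, swap two adjacent
    characters inside a token."""
    rng = random.Random(seed)
    tokens = []
    for token in text.split():
        if len(token) > 1 and rng.random() < prob:
            i = rng.randrange(len(token) - 1)
            token = token[:i] + token[i + 1] + token[i] + token[i + 2:]
        tokens.append(token)
    return " ".join(tokens)

def worst_case_score(score_fn, instance: str) -> float:
    """Per-instance worst case over unperturbed and perturbed inputs, which is
    how the perturbation rows above are aggregated. `score_fn` is assumed to
    run the model and metric on one input and return a score."""
    return min(score_fn(instance), score_fn(add_typos(instance)))
```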
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: calibration
+    display_name: Calibration
+    metrics:
+      - name: ece_10_bin
+        split: ${main_split}
+
+  - name: calibration_detailed
+    display_name: Calibration (Detailed)
+    description: Measures how calibrated the model is (how meaningful its uncertainty estimates are).
+    metrics:
+      - name: max_prob
+        split: ${main_split}
+      - name: ece_1_bin
+        split: ${main_split}
+      - name: ece_10_bin
+        split: ${main_split}
+      - name: selective_cov_acc_area
+        split: ${main_split}
+      - name: selective_acc@10
+        split: ${main_split}
+      - name: platt_ece_1_bin
+        split: ${main_split}
+      - name: platt_ece_10_bin
+        split: ${main_split}
+      - name: platt_coef
+        split: ${main_split}
+      - name: platt_intercept
+        split: ${main_split}
+
+  - name: robustness
+    display_name: Robustness
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+        perturbation_name: robustness
+
+  # TODO: Add other robustness perturbations
+  - name: robustness_detailed
+    display_name: Robustness (Detailed)
+    description: Measures how robust the model is to semantics-preserving perturbations (typos, synonyms).
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+        perturbation_name: typos
+      - name: ${main_name}
+        split: ${main_split}
+        perturbation_name: synonym
+
+  - name: fairness
+    display_name: Fairness
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+        perturbation_name: fairness
+
+  # TODO: Add other fairness perturbations
+  - name: fairness_detailed
+    display_name: Fairness (Detailed)
+    description: Measures how the model performs under fairness perturbations (dialect, race of names, gender).
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+        perturbation_name: dialect
+      - name: ${main_name}
+        split: ${main_split}
+        perturbation_name: race
+      - name: ${main_name}
+        split: ${main_split}
+        perturbation_name: gender
+
+  - name: bias
+    display_name: Bias
+    metrics:
+      - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
+        split: ${main_split}
+      - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
+        split: ${main_split}
+      - name: bias_metric:mode=representation,demographic_category=race
+        split: ${main_split}
+      - name: bias_metric:mode=representation,demographic_category=gender
+        split: ${main_split}
+
+  - name: toxicity
+    display_name: Toxicity
+    metrics:
+      - name: toxic_frac
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_denoised_runtime
+        split: ${main_split}
+
+  - name: efficiency_detailed
+    display_name: Efficiency (Detailed)
+    description: The efficiency of the model across both training and inference.
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+      - name: inference_idealized_runtime
+        split: ${main_split}
+      - name: inference_denoised_runtime
+        split: ${main_split}
+      - name: training_co2_cost
+        split: ${main_split}
+      - name: training_energy_cost
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+      - name: num_train_trials
+        split: ${main_split}
+
+  # Special metrics for scenarios with more than 1 main metric
+  - name: summarization_metrics
+    display_name: Summarization metrics
+    metrics:
+      - name: summac
+        split: ${main_split}
+      - name: QAFactEval
+        split: ${main_split}
+      - name: BERTScore-F
+        split: ${main_split}
+      - name: summarization_coverage
+        split: ${main_split}
+      - name: summarization_density
+        split: ${main_split}
+      - name: summarization_compression
+        split: ${main_split}
+      - name: HumanEval-faithfulness
+        split: ${main_split}
+      - name: HumanEval-relevance
+        split: ${main_split}
+      - name: HumanEval-coherence
+        split: ${main_split}
+
+  - name: classification_metrics
+    display_name: Classification metrics
+    metrics:
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
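The `${main_name}` and `${main_split}` placeholders in the metric groups above are filled in per run group from the `environment` blocks that follow. A sketch of that substitution with `string.Template` (the schema loader has its own resolver; these dicts are illustrative):

```python
from string import Template

# A metric-group entry and one run group's environment, as in the YAML above.
metric_group_entry = {"name": "${main_name}", "split": "${main_split}"}
environment = {"main_name": "quasi_exact_match", "main_split": "test"}

resolved = {
    key: Template(value).substitute(environment)
    for key, value in metric_group_entry.items()
}
print(resolved)  # {'name': 'quasi_exact_match', 'split': 'test'}
```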
+#######################################################
+run_groups:
+  - name: melt
+    display_name: MELT Scenarios
+    description: Scenarios for evaluating models on Vietnamese-language tasks.
+    category: All scenarios
+    subgroups:
+      - melt_question_answering_mlqa
+      - melt_question_answering_xquad
+      - melt_summarization_vietnews
+      - melt_summarization_wikilingua
+      - melt_synthetic_reasoning
+      - melt_math
+      - melt_text_classification_vsmec
+      - melt_text_classification_phoatis
+      - melt_sentiment_analysis_vlsp
+      - melt_sentiment_analysis_vsfc
+      - melt_translation_opus100
+      - melt_translation_phomt
+      - melt_lm_mask_filling_mlqa
+      - melt_lm_spelling_correction_vsec
+      - melt_knowledge_zalo
+      - melt_knowledge_vimmrc
+      - melt_toxicity_detection_vihsd
+      - melt_toxicity_detection_victsd
+      - melt_information_retrieval_mmarco
+      - melt_information_retrieval_mrobust
+
+  - name: melt_question_answering_mlqa
+    display_name: MLQA
+    description: Scenarios for question answering with the MLQA dataset.
+    category: Question Answering
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_question_answering_xquad
+    display_name: XQuAD
+    description: Scenarios for question answering with the XQuAD dataset.
+    category: Question Answering
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_summarization_vietnews
+    display_name: VietNews
+    description: Scenarios for summarization with the VietNews dataset.
+    category: Summarization
+    metric_groups:
+      - accuracy
+      - summarization_metrics
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: rouge_2
+      main_split: test
+    taxonomy:
+      task: summarization
+      what: "Vietnamese online newspapers."
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_summarization_wikilingua
+    display_name: WikiLingua
+    description: Scenarios for summarization with the WikiLingua dataset.
+    category: Summarization
+    metric_groups:
+      - accuracy
+      - summarization_metrics
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: rouge_2
+      main_split: test
+    taxonomy:
+      task: summarization
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_synthetic_reasoning
+    display_name: Synthetic reasoning (abstract symbols)
+    description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: "reasoning"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+    subgroups:
+      - melt_synthetic_reasoning_pattern_match
+      - melt_synthetic_reasoning_variable_substitution
+      - melt_synthetic_reasoning_induction
+
+  - name: melt_synthetic_reasoning_pattern_match
+    display_name: Synthetic reasoning (pattern match)
+    description: Synthetic reasoning tasks defined using pattern matching based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: "reasoning"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: melt_synthetic_reasoning_variable_substitution
+    display_name: Synthetic reasoning (variable substitution)
+    description: Synthetic reasoning tasks defined using variable substitution based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: "reasoning"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: melt_synthetic_reasoning_induction
+    display_name: Synthetic reasoning (induction)
+    description: Synthetic reasoning tasks defined using induction based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: "reasoning"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: melt_synthetic_reasoning_natural
+    display_name: Synthetic reasoning (natural language)
+    description: Synthetic reasoning tasks defined using simple natural language based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_set_match
+      main_split: test
+    taxonomy:
+      task: "reasoning"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: melt_math
+    display_name: MATH
+    description: The MATH benchmark for measuring mathematical problem solving on competition math problems [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    subgroups:
+      - melt_math_regular
+      - melt_math_chain_of_thought
+
+  - name: melt_math_regular
+    display_name: MATH
+    description: The MATH benchmark for measuring mathematical problem solving on competition math problems [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: math_equiv
+      main_split: test
+    taxonomy:
+      task: "reasoning"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: melt_math_chain_of_thought
+    display_name: MATH (chain-of-thought)
+    description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: math_equiv_chain_of_thought
+      main_split: test
+    taxonomy:
+      task: "reasoning"
+      what: n/a
+      who: n/a
+      when: n/a
+      language: synthetic
+
+  - name: melt_text_classification_phoatis
+    display_name: PhoATIS
+    short_display_name: PhoATIS
+    description: The PhoATIS benchmark for measuring intent classification of Vietnamese flight-booking queries (a Vietnamese version of ATIS).
+    metric_groups:
+      - accuracy
+      - calibration
+      - robustness
+      - fairness
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "Flight information."
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_text_classification_vsmec
+    display_name: VSMEC
+    short_display_name: VSMEC
+    description: The VSMEC benchmark for measuring emotion classification on the Vietnamese Social Media Emotion Corpus.
+    metric_groups:
+      - accuracy
+      - calibration
+      - robustness
+      - fairness
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_sentiment_analysis_vlsp
+    display_name: VLSP
+    short_display_name: VLSP
+    description: The VLSP benchmark for measuring sentiment analysis on Vietnamese online comments.
+    metric_groups:
+      - accuracy
+      - calibration
+      - robustness
+      - fairness
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: "Online comments"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_sentiment_analysis_vsfc
+    display_name: VSFC
+    short_display_name: VSFC
+    description: The VSFC benchmark for measuring sentiment analysis on the Vietnamese Students' Feedback Corpus.
+    metric_groups:
+      - accuracy
+      - calibration
+      - robustness
+      - fairness
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_translation_opus100
+    display_name: OPUS100
+    short_display_name: OPUS100
+    description: The OPUS100 benchmark for measuring English-Vietnamese translation.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: translation
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_translation_phomt
+    display_name: PhoMT
+    short_display_name: PhoMT
+    description: The PhoMT benchmark for measuring English-Vietnamese translation.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: translation
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_lm_mask_filling_mlqa
+    display_name: MLQA
+    description: The MLQA benchmark for measuring language model mask filling on Vietnamese text.
+    metric_groups:
+      - accuracy
+      - calibration
+      - robustness
+      - fairness
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: language model mask filling
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_lm_spelling_correction_vsec
+    display_name: VSEC
+    short_display_name: VSEC
+    description: The VSEC benchmark for measuring spelling correction on Vietnamese text.
+    metric_groups:
+      - accuracy
+      - calibration
+      - robustness
+      - fairness
+      - bias
+      - toxicity
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: language model spelling correction
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Vietnamese
+
+  - name: melt_knowledge_zalo
+    display_name: ZaloE2E
+    short_display_name: ZaloE2E
+    description: The ZaloE2E benchmark for measuring a model's intrinsic knowledge with Vietnamese open-domain questions.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: intrinsic knowledge
+      what: "?"
1147
|
+
who: "?"
|
|
1148
|
+
when: "?"
|
|
1149
|
+
language: Vietnamese
|
|
1150
|
+
|
|
1151
|
+
- name: melt_knowledge_vimmrc
|
|
1152
|
+
display_name: ViMMRC
|
|
1153
|
+
short_display_name: ViMMRC
|
|
1154
|
+
description: The ViMMRC benchmark for measuring knowledge extraction on Vietnamese ViMMRC.
|
|
1155
|
+
metric_groups:
|
|
1156
|
+
- accuracy
|
|
1157
|
+
- efficiency
|
|
1158
|
+
- general_information
|
|
1159
|
+
environment:
|
|
1160
|
+
main_name: quasi_exact_match
|
|
1161
|
+
main_split: test
|
|
1162
|
+
taxonomy:
|
|
1163
|
+
task: instrinsic knowledge
|
|
1164
|
+
what: "?"
|
|
1165
|
+
who: "?"
|
|
1166
|
+
when: "?"
|
|
1167
|
+
language: Vietnamese
|
|
1168
|
+
|
|
1169
|
+
- name: melt_toxicity_detection_vihsd
|
|
1170
|
+
display_name: ViHSD
|
|
1171
|
+
short_display_name: ViHSD
|
|
1172
|
+
description: The ViHSD benchmark for measuring toxicity detection on Vietnamese ViHSD.
|
|
1173
|
+
metric_groups:
|
|
1174
|
+
- accuracy
|
|
1175
|
+
- calibration
|
|
1176
|
+
- robustness
|
|
1177
|
+
- fairness
|
|
1178
|
+
- bias
|
|
1179
|
+
- toxicity
|
|
1180
|
+
- efficiency
|
|
1181
|
+
- general_information
|
|
1182
|
+
environment:
|
|
1183
|
+
main_name: quasi_exact_match
|
|
1184
|
+
main_split: test
|
|
1185
|
+
taxonomy:
|
|
1186
|
+
task: toxicity classification
|
|
1187
|
+
what: "?"
|
|
1188
|
+
who: "?"
|
|
1189
|
+
when: "?"
|
|
1190
|
+
language: Vietnamese
|
|
1191
|
+
|
|
1192
|
+
- name: melt_toxicity_detection_victsd
|
|
1193
|
+
display_name: ViCTSD
|
|
1194
|
+
short_display_name: ViCTSD
|
|
1195
|
+
description: The ViCTSD benchmark for measuring toxicity detection on Vietnamese ViCTSD.
|
|
1196
|
+
metric_groups:
|
|
1197
|
+
- accuracy
|
|
1198
|
+
- calibration
|
|
1199
|
+
- robustness
|
|
1200
|
+
- fairness
|
|
1201
|
+
- bias
|
|
1202
|
+
- toxicity
|
|
1203
|
+
- efficiency
|
|
1204
|
+
- general_information
|
|
1205
|
+
environment:
|
|
1206
|
+
main_name: quasi_exact_match
|
|
1207
|
+
main_split: test
|
|
1208
|
+
taxonomy:
|
|
1209
|
+
task: toxicity classification
|
|
1210
|
+
what: "?"
|
|
1211
|
+
who: "?"
|
|
1212
|
+
when: "?"
|
|
1213
|
+
language: Vietnamese
|
|
1214
|
+
|
|
1215
|
+
- name: melt_information_retrieval_mmarco
|
|
1216
|
+
display_name: MARCO
|
|
1217
|
+
short_display_name: MARCO
|
|
1218
|
+
description: The MARCO benchmark for measuring information retrieval on Vietnamese MARCO.
|
|
1219
|
+
metric_groups:
|
|
1220
|
+
- accuracy
|
|
1221
|
+
- robustness
|
|
1222
|
+
- fairness
|
|
1223
|
+
- bias
|
|
1224
|
+
- toxicity
|
|
1225
|
+
- efficiency
|
|
1226
|
+
- general_information
|
|
1227
|
+
environment:
|
|
1228
|
+
main_name: RR@10
|
|
1229
|
+
main_split: valid
|
|
1230
|
+
taxonomy:
|
|
1231
|
+
task: information retrieval
|
|
1232
|
+
what: "?"
|
|
1233
|
+
who: "?"
|
|
1234
|
+
when: "?"
|
|
1235
|
+
language: Vietnamese
|
|
1236
|
+
|
|
1237
|
+
- name: melt_information_retrieval_mrobust
|
|
1238
|
+
display_name: MRobust
|
|
1239
|
+
short_display_name: MRobust
|
|
1240
|
+
description: The MRobust benchmark for measuring information retrieval on Vietnamese MRobust.
|
|
1241
|
+
metric_groups:
|
|
1242
|
+
- accuracy
|
|
1243
|
+
- robustness
|
|
1244
|
+
- fairness
|
|
1245
|
+
- bias
|
|
1246
|
+
- toxicity
|
|
1247
|
+
- efficiency
|
|
1248
|
+
- general_information
|
|
1249
|
+
environment:
|
|
1250
|
+
main_name: NDCG@10
|
|
1251
|
+
main_split: valid
|
|
1252
|
+
taxonomy:
|
|
1253
|
+
task: information retrieval
|
|
1254
|
+
what: "?"
|
|
1255
|
+
who: "?"
|
|
1256
|
+
when: "?"
|
|
1257
|
+
language: Vietnamese
|
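
For orientation, the added lines above are run-group records in a HELM schema YAML: each entry names a benchmark, its metric groups, the main metric and split used for headline numbers (`environment`), and a `taxonomy` block. The sketch below shows one way such records could be loaded and sanity-checked. It is a minimal illustration assuming PyYAML; the filename `schema_melt.yaml` and the simplified `RunGroup` dataclass are placeholders, not HELM's own schema loader.

```python
# Minimal sketch (not HELM's actual loader): parse run-group entries
# shaped like the YAML above and verify each names a main metric/split.
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import yaml  # PyYAML


@dataclass
class RunGroup:
    name: str
    display_name: str
    description: str = ""
    short_display_name: Optional[str] = None
    metric_groups: List[str] = field(default_factory=list)
    environment: Dict[str, str] = field(default_factory=dict)
    taxonomy: Dict[str, str] = field(default_factory=dict)


def load_run_groups(path: str) -> List[RunGroup]:
    with open(path) as f:
        raw = yaml.safe_load(f)
    groups: List[RunGroup] = []
    for entry in raw.get("run_groups", []):
        group = RunGroup(**entry)
        # Every entry above declares both keys, so treat their absence
        # as a schema error.
        for key in ("main_name", "main_split"):
            if key not in group.environment:
                raise ValueError(f"{group.name}: environment missing {key!r}")
        groups.append(group)
    return groups


if __name__ == "__main__":
    # "schema_melt.yaml" is a hypothetical path for this sketch.
    for group in load_run_groups("schema_melt.yaml"):
        print(group.name, "->", group.environment["main_name"])
```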
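Note that the two information-retrieval entries are the only ones whose main metric is a ranking measure (`RR@10` for mMARCO, `NDCG@10` for MRobust) rather than `quasi_exact_match`. For reference, the sketch below gives the standard textbook definitions of those two measures; it is not necessarily HELM's exact implementation.

```python
# Standard definitions of RR@k and NDCG@k over a ranked list of
# graded relevance labels (illustrative, not HELM's implementation).
import math
from typing import List


def reciprocal_rank_at_k(relevances: List[int], k: int = 10) -> float:
    """1/rank of the first relevant document within the top k, else 0."""
    for rank, rel in enumerate(relevances[:k], start=1):
        if rel > 0:
            return 1.0 / rank
    return 0.0


def ndcg_at_k(relevances: List[int], k: int = 10) -> float:
    """Discounted cumulative gain at k, normalized by the ideal ordering."""
    def dcg(rels: List[int]) -> float:
        return sum(rel / math.log2(i + 1) for i, rel in enumerate(rels, start=1))

    ideal = sorted(relevances, reverse=True)
    denom = dcg(ideal[:k])
    return dcg(relevances[:k]) / denom if denom > 0 else 0.0


# Example: first relevant hit at position 2.
print(reciprocal_rank_at_k([0, 1, 0, 1]))  # 0.5
print(ndcg_at_k([0, 1, 0, 1]))             # ~0.65
```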