crfm-helm: crfm_helm-0.4.0-py3-none-any.whl → crfm_helm-0.5.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,1449 @@
+---
+############################################################
+# For backwards compatibility with older versions of HELM.
+# TODO: Remove this after 2024-09-01.
+adapter: []
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: exact_match@5
+    display_name: Exact match @5
+    short_display_name: EM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match@5
+    display_name: Quasi-exact match @5
+    short_display_name: EM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match@5
+    display_name: Prefix exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match@5
+    display_name: Prefix quasi-exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: logprob
+    display_name: Log probability
+    short_display_name: Logprob
+    description: Predicted output's average log probability (input's log prob for language modeling).
+    lower_is_better: false
+  - name: logprob_per_byte
+    display_name: Log probability / byte
+    short_display_name: Logprob/byte
+    description: Predicted output's average log probability normalized by the number of bytes.
+    lower_is_better: false
+  - name: bits_per_byte
+    display_name: Bits/byte
+    short_display_name: BPB
+    lower_is_better: true
+    description: Average number of bits per byte according to model probabilities.
+  - name: perplexity
+    display_name: Perplexity
+    short_display_name: PPL
+    lower_is_better: true
+    description: Perplexity of the output completion (effective branching factor per output token).
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: bleu_1
+    display_name: BLEU-1
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: bleu_4
+    display_name: BLEU-4
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+    lower_is_better: false
+  - name: f1_set_match
+    display_name: F1 (set match)
+    short_display_name: F1
+    description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+  - name: absolute_value_difference
+    display_name: Absolute difference
+    short_display_name: Diff.
+    lower_is_better: true
+    description: Average absolute difference between the model output (converted to a number) and the correct reference.
+  - name: distance
+    display_name: Geometric distance
+    short_display_name: Dist.
+    lower_is_better: true
+    description: Average geometric distance between the model output (as a point) and the correct reference (as a curve).
+  - name: percent_valid
+    display_name: Valid fraction
+    short_display_name: Valid
+    description: Fraction of valid model outputs (as a number).
+    lower_is_better: false
+  - name: NDCG@10
+    display_name: NDCG@10
+    description: Normalized discounted cumulative gain at 10 in information retrieval.
+    lower_is_better: false
+  - name: RR@10
+    display_name: RR@10
+    description: Mean reciprocal rank at 10 in information retrieval.
+    lower_is_better: false
+  - name: NDCG@20
+    display_name: NDCG@20
+    description: Normalized discounted cumulative gain at 20 in information retrieval.
+    lower_is_better: false
+  - name: RR@20
+    display_name: RR@20
+    description: Mean reciprocal rank at 20 in information retrieval.
+    lower_is_better: false
+  - name: math_equiv
+    display_name: Equivalent
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference.
+    lower_is_better: false
+  - name: math_equiv_chain_of_thought
+    display_name: Equivalent (CoT)
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
+    lower_is_better: false
+  - name: exact_match_indicator
+    display_name: Exact match (final)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
+    lower_is_better: false
+  - name: final_number_exact_match
+    display_name: Exact match (final number)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
+    lower_is_better: false
+  - name: exact_set_match
+    display_name: Exact match (at sets)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
+    lower_is_better: false
+  - name: iou_set_match
+    display_name: Intersection over union (as sets)
+    short_display_name: IoU
+    description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
+    lower_is_better: false
+
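Several of the accuracy metrics above are defined "up to light processing". That phrase refers to SQuAD-style answer normalization: lowercase the text, strip punctuation, drop English articles, and collapse whitespace before comparing. A minimal sketch of that recipe follows; the exact normalization lives in HELM's metrics code and may differ in detail, and normalize_text here is an illustrative name rather than the package's API:

    # Sketch of the SQuAD-style normalization behind "quasi-exact match".
    import re
    import string

    def normalize_text(text: str) -> str:
        text = text.lower()
        text = "".join(ch for ch in text if ch not in string.punctuation)
        text = re.sub(r"\b(a|an|the)\b", " ", text)  # drop English articles
        return " ".join(text.split())  # collapse runs of whitespace

    def quasi_exact_match(prediction: str, reference: str) -> bool:
        return normalize_text(prediction) == normalize_text(reference)

Under this definition, quasi_exact_match(" The Answer. ", "answer") is True even though the raw strings differ.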
+  # Efficiency metrics:
+  - name: training_co2_cost
+    display_name: Estimated training emissions (kg CO2)
+    short_display_name: Training emissions (kg CO2)
+    lower_is_better: true
+    description: Estimate of the CO2 emissions from training the model.
+  - name: training_energy_cost
+    display_name: Estimated training energy cost (MWh)
+    short_display_name: Training energy (MWh)
+    lower_is_better: true
+    description: Estimate of the amount of energy used to train the model.
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on the particular deployment).
+  - name: inference_idealized_runtime
+    display_name: Idealized inference runtime (s)
+    short_display_name: Idealized inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+  - name: inference_denoised_runtime
+    display_name: Denoised inference runtime (s)
+    short_display_name: Denoised inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model minus performance contention, estimated using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+  - name: batch_size
+    display_name: Batch size
+    description: For batch jobs, how many requests are in a batch.
+
+  # Calibration metrics:
+  - name: ece_1_bin
+    display_name: 1-bin expected calibration error
+    short_display_name: ECE (1-bin)
+    lower_is_better: true
+    description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
+  - name: max_prob
+    display_name: Max prob
+    description: Model's average confidence in its prediction (only computed for classification tasks).
+    lower_is_better: false
+  - name: ece_10_bin
+    display_name: 10-bin expected calibration error
+    short_display_name: ECE (10-bin)
+    lower_is_better: true
+    description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
+  - name: platt_ece_1_bin
+    display_name: 1-bin expected calibration error (after Platt scaling)
+    short_display_name: Platt-scaled ECE (1-bin)
+    lower_is_better: true
+    description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+  - name: platt_ece_10_bin
+    display_name: 10-bin expected calibration error (after Platt scaling)
+    short_display_name: Platt-scaled ECE (10-bin)
+    lower_is_better: true
+    description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+  - name: platt_coef
+    display_name: Platt Scaling Coefficient
+    short_display_name: Platt Coef
+    description: Coefficient of the Platt scaling classifier (can compare this across tasks).
+    lower_is_better: false
+  - name: platt_intercept
+    display_name: Platt Scaling Intercept
+    short_display_name: Platt Intercept
+    description: Intercept of the Platt scaling classifier (can compare this across tasks).
+    lower_is_better: false
+  - name: selective_cov_acc_area
+    display_name: Selective coverage-accuracy area
+    short_display_name: Selective Acc
+    description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
+    lower_is_better: false
+  - name: selective_acc@10
+    display_name: Accuracy at 10% coverage
+    short_display_name: Acc@10%
+    description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
+    lower_is_better: false
+
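The Platt-scaled variants above recalibrate the model's confidences before measuring calibration error: a logistic regression is fit from confidence to correctness, and its single coefficient and intercept are what platt_coef and platt_intercept report. A minimal sketch with scikit-learn on made-up data (illustrative only; HELM's own calibration code may differ):

    # Sketch: fit Platt scaling to per-instance confidences.
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    confidences = np.array([[0.9], [0.8], [0.55], [0.95], [0.4], [0.7]])  # max_prob per instance
    correct = np.array([1, 1, 0, 1, 0, 1])  # 1 if the prediction matched a reference

    recalibrator = LogisticRegression().fit(confidences, correct)
    platt_coef = recalibrator.coef_[0][0]
    platt_intercept = recalibrator.intercept_[0]
    # The recalibrated probabilities would then feed the platt_ece_* computations:
    recalibrated = recalibrator.predict_proba(confidences)[:, 1]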
+############################################################
+perturbations: []
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
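The ${main_name} and ${main_split} references in these metric groups are placeholders, not metric names: each run group below fills them in through its environment mapping, so the same accuracy group can point at exact_match for one scenario and a different headline metric for another. A minimal sketch of the substitution, using Python's string.Template (resolve_metric_ref is a hypothetical helper, not HELM's own API):

    # Sketch: resolve a metric-group entry against a run group's environment.
    from string import Template

    def resolve_metric_ref(ref: dict, environment: dict) -> dict:
        return {
            key: Template(value).substitute(environment) if isinstance(value, str) else value
            for key, value in ref.items()
        }

    # {"name": "${main_name}", "split": "${main_split}"} resolved against the
    # mmlu environment below yields {"name": "exact_match", "split": "test"}.

So for every MMLU run group in this file, the accuracy column resolves to exact_match on the test split.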
+############################################################
+run_groups:
+  - name: mmlu_subjects
+    display_name: MMLU Subjects
+    short_display_name: MMLU Subjects
+    description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
+    category: All Scenarios
+    subgroups:
+      - mmlu
+      - mmlu_abstract_algebra
+      - mmlu_anatomy
+      - mmlu_college_chemistry
+      - mmlu_computer_security
+      - mmlu_econometrics
+      - mmlu_global_facts
+      - mmlu_jurisprudence
+      - mmlu_philosophy
+      - mmlu_professional_medicine
+      - mmlu_us_foreign_policy
+      - mmlu_astronomy
+      - mmlu_business_ethics
+      - mmlu_clinical_knowledge
+      - mmlu_college_biology
+      - mmlu_college_computer_science
+      - mmlu_college_mathematics
+      - mmlu_college_medicine
+      - mmlu_college_physics
+      - mmlu_conceptual_physics
+      - mmlu_electrical_engineering
+      - mmlu_elementary_mathematics
+      - mmlu_formal_logic
+      - mmlu_high_school_biology
+      - mmlu_high_school_chemistry
+      - mmlu_high_school_computer_science
+      - mmlu_high_school_european_history
+      - mmlu_high_school_geography
+      - mmlu_high_school_government_and_politics
+      - mmlu_high_school_macroeconomics
+      - mmlu_high_school_mathematics
+      - mmlu_high_school_microeconomics
+      - mmlu_high_school_physics
+      - mmlu_high_school_psychology
+      - mmlu_high_school_statistics
+      - mmlu_high_school_us_history
+      - mmlu_high_school_world_history
+      - mmlu_human_aging
+      - mmlu_human_sexuality
+      - mmlu_international_law
+      - mmlu_logical_fallacies
+      - mmlu_machine_learning
+      - mmlu_management
+      - mmlu_marketing
+      - mmlu_medical_genetics
+      - mmlu_miscellaneous
+      - mmlu_moral_disputes
+      - mmlu_moral_scenarios
+      - mmlu_nutrition
+      - mmlu_prehistory
+      - mmlu_professional_accounting
+      - mmlu_professional_law
+      - mmlu_professional_psychology
+      - mmlu_public_relations
+      - mmlu_security_studies
+      - mmlu_sociology
+      - mmlu_virology
+      - mmlu_world_religions
+
+  - name: mmlu
+    display_name: Massive Multitask Language Understanding (MMLU) All Subjects
+    short_display_name: MMLU All Subjects
+    description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: math, science, history, etc.
+      who: various online sources
+      when: before 2021
+      language: English
+
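Every per-subject entry that follows has exactly the same shape: only the name, the display titles, and the taxonomy's what field change from block to block. A sketch of how such entries could be generated from the subject slugs rather than written by hand (purely illustrative; the repository may maintain this file differently):

    # Sketch: build one run-group dict per MMLU subject slug.
    MMLU_SUBJECTS = ["abstract_algebra", "anatomy", "college_chemistry"]  # ... 57 slugs in total

    def mmlu_run_group(slug: str) -> dict:
        subject = slug.replace("_", " ")
        title = subject.title()
        return {
            "name": f"mmlu_{slug}",
            "display_name": title,
            "short_display_name": title,
            "description": f"The {subject} subject in the Massive Multitask Language Understanding (MMLU) benchmark.",
            "metric_groups": ["accuracy", "efficiency", "general_information"],
            "environment": {"main_name": "exact_match", "main_split": "test"},
            "taxonomy": {
                "task": "multiple-choice question answering",
                "what": subject,
                "who": "various online sources",
                "when": "before 2021",
                "language": "English",
            },
        }

Note that a naive str.title() is what yields titles like "Us Foreign Policy" and "High School Government And Politics"; the entries below correct those to "US Foreign Policy" and "High School Government and Politics".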
+  - name: mmlu_abstract_algebra
+    display_name: Abstract Algebra
+    short_display_name: Abstract Algebra
+    description: The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: abstract algebra
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_anatomy
+    display_name: Anatomy
+    short_display_name: Anatomy
+    description: The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: anatomy
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_college_chemistry
+    display_name: College Chemistry
+    short_display_name: College Chemistry
+    description: The college chemistry subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college chemistry
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_computer_security
+    display_name: Computer Security
+    short_display_name: Computer Security
+    description: The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: computer security
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_econometrics
+    display_name: Econometrics
+    short_display_name: Econometrics
+    description: The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: econometrics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_global_facts
+    display_name: Global Facts
+    short_display_name: Global Facts
+    description: The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: global facts
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_jurisprudence
+    display_name: Jurisprudence
+    short_display_name: Jurisprudence
+    description: The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: jurisprudence
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_philosophy
+    display_name: Philosophy
+    short_display_name: Philosophy
+    description: The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: philosophy
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_professional_medicine
+    display_name: Professional Medicine
+    short_display_name: Professional Medicine
+    description: The professional medicine subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: professional medicine
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_us_foreign_policy
+    display_name: US Foreign Policy
+    short_display_name: US Foreign Policy
+    description: The US foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: US foreign policy
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_astronomy
+    display_name: Astronomy
+    short_display_name: Astronomy
+    description: The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: astronomy
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_business_ethics
+    display_name: Business Ethics
+    short_display_name: Business Ethics
+    description: The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: business ethics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_clinical_knowledge
+    display_name: Clinical Knowledge
+    short_display_name: Clinical Knowledge
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_college_biology
+    display_name: College Biology
+    short_display_name: College Biology
+    description: The college biology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college biology
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_college_computer_science
+    display_name: College Computer Science
+    short_display_name: College Computer Science
+    description: The college computer science subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college computer science
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_college_mathematics
+    display_name: College Mathematics
+    short_display_name: College Mathematics
+    description: The college mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college mathematics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_college_medicine
+    display_name: College Medicine
+    short_display_name: College Medicine
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_college_physics
+    display_name: College Physics
+    short_display_name: College Physics
+    description: The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college physics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_conceptual_physics
+    display_name: Conceptual Physics
+    short_display_name: Conceptual Physics
+    description: The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: conceptual physics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_electrical_engineering
+    display_name: Electrical Engineering
+    short_display_name: Electrical Engineering
+    description: The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: electrical engineering
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_elementary_mathematics
+    display_name: Elementary Mathematics
+    short_display_name: Elementary Mathematics
+    description: The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: elementary mathematics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_formal_logic
+    display_name: Formal Logic
+    short_display_name: Formal Logic
+    description: The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: formal logic
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_biology
+    display_name: High School Biology
+    short_display_name: High School Biology
+    description: The high school biology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school biology
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_chemistry
+    display_name: High School Chemistry
+    short_display_name: High School Chemistry
+    description: The high school chemistry subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school chemistry
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_computer_science
+    display_name: High School Computer Science
+    short_display_name: High School Computer Science
+    description: The high school computer science subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school computer science
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_european_history
+    display_name: High School European History
+    short_display_name: High School European History
+    description: The high school European history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school European history
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_geography
+    display_name: High School Geography
+    short_display_name: High School Geography
+    description: The high school geography subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school geography
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_government_and_politics
+    display_name: High School Government and Politics
+    short_display_name: High School Government and Politics
+    description: The high school government and politics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school government and politics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_macroeconomics
+    display_name: High School Macroeconomics
+    short_display_name: High School Macroeconomics
+    description: The high school macroeconomics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school macroeconomics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_mathematics
+    display_name: High School Mathematics
+    short_display_name: High School Mathematics
+    description: The high school mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school mathematics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_microeconomics
+    display_name: High School Microeconomics
+    short_display_name: High School Microeconomics
+    description: The high school microeconomics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school microeconomics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_physics
+    display_name: High School Physics
+    short_display_name: High School Physics
+    description: The high school physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school physics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_psychology
+    display_name: High School Psychology
+    short_display_name: High School Psychology
+    description: The high school psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school psychology
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_statistics
+    display_name: High School Statistics
+    short_display_name: High School Statistics
+    description: The high school statistics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school statistics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_us_history
+    display_name: High School US History
+    short_display_name: High School US History
+    description: The high school US history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school US history
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_high_school_world_history
+    display_name: High School World History
+    short_display_name: High School World History
+    description: The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: high school world history
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_human_aging
+    display_name: Human Aging
+    short_display_name: Human Aging
+    description: The human aging subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: human aging
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_human_sexuality
+    display_name: Human Sexuality
+    short_display_name: Human Sexuality
+    description: The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: human sexuality
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_international_law
+    display_name: International Law
+    short_display_name: International Law
+    description: The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: international law
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_logical_fallacies
+    display_name: Logical Fallacies
+    short_display_name: Logical Fallacies
+    description: The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: logical fallacies
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_machine_learning
+    display_name: Machine Learning
+    short_display_name: Machine Learning
+    description: The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: machine learning
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_management
+    display_name: Management
+    short_display_name: Management
+    description: The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: management
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_marketing
+    display_name: Marketing
+    short_display_name: Marketing
+    description: The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: marketing
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_medical_genetics
+    display_name: Medical Genetics
+    short_display_name: Medical Genetics
+    description: The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: medical genetics
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_miscellaneous
+    display_name: Miscellaneous
+    short_display_name: Miscellaneous
+    description: The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: miscellaneous
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_moral_disputes
+    display_name: Moral Disputes
+    short_display_name: Moral Disputes
+    description: The moral disputes subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: moral disputes
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: mmlu_moral_scenarios
+    display_name: Moral Scenarios
+    short_display_name: Moral Scenarios
+    description: The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
|
|
1263
|
+
main_split: test
|
|
1264
|
+
taxonomy:
|
|
1265
|
+
task: multiple-choice question answering
|
|
1266
|
+
what: moral scenarios
|
|
1267
|
+
who: various online sources
|
|
1268
|
+
when: before 2021
|
|
1269
|
+
language: English
|
|
1270
|
+
|
|
1271
|
+
- name: mmlu_nutrition
|
|
1272
|
+
display_name: Nutrition
|
|
1273
|
+
short_display_name: Nutrition
|
|
1274
|
+
description: The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1275
|
+
metric_groups:
|
|
1276
|
+
- accuracy
|
|
1277
|
+
- efficiency
|
|
1278
|
+
- general_information
|
|
1279
|
+
environment:
|
|
1280
|
+
main_name: exact_match
|
|
1281
|
+
main_split: test
|
|
1282
|
+
taxonomy:
|
|
1283
|
+
task: multiple-choice question answering
|
|
1284
|
+
what: nutrition
|
|
1285
|
+
who: various online sources
|
|
1286
|
+
when: before 2021
|
|
1287
|
+
language: English
|
|
1288
|
+
|
|
1289
|
+
- name: mmlu_prehistory
|
|
1290
|
+
display_name: Prehistory
|
|
1291
|
+
short_display_name: Prehistory
|
|
1292
|
+
description: The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1293
|
+
metric_groups:
|
|
1294
|
+
- accuracy
|
|
1295
|
+
- efficiency
|
|
1296
|
+
- general_information
|
|
1297
|
+
environment:
|
|
1298
|
+
main_name: exact_match
|
|
1299
|
+
main_split: test
|
|
1300
|
+
taxonomy:
|
|
1301
|
+
task: multiple-choice question answering
|
|
1302
|
+
what: prehistory
|
|
1303
|
+
who: various online sources
|
|
1304
|
+
when: before 2021
|
|
1305
|
+
language: English
|
|
1306
|
+
|
|
1307
|
+
- name: mmlu_professional_accounting
|
|
1308
|
+
display_name: Professional Accounting
|
|
1309
|
+
short_display_name: Professional Accounting
|
|
1310
|
+
description: The professional accounting subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1311
|
+
metric_groups:
|
|
1312
|
+
- accuracy
|
|
1313
|
+
- efficiency
|
|
1314
|
+
- general_information
|
|
1315
|
+
environment:
|
|
1316
|
+
main_name: exact_match
|
|
1317
|
+
main_split: test
|
|
1318
|
+
taxonomy:
|
|
1319
|
+
task: multiple-choice question answering
|
|
1320
|
+
what: professional accounting
|
|
1321
|
+
who: various online sources
|
|
1322
|
+
when: before 2021
|
|
1323
|
+
language: English
|
|
1324
|
+
|
|
1325
|
+
- name: mmlu_professional_law
|
|
1326
|
+
display_name: Professional Law
|
|
1327
|
+
short_display_name: Professional Law
|
|
1328
|
+
description: The professional law subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1329
|
+
metric_groups:
|
|
1330
|
+
- accuracy
|
|
1331
|
+
- efficiency
|
|
1332
|
+
- general_information
|
|
1333
|
+
environment:
|
|
1334
|
+
main_name: exact_match
|
|
1335
|
+
main_split: test
|
|
1336
|
+
taxonomy:
|
|
1337
|
+
task: multiple-choice question answering
|
|
1338
|
+
what: professional law
|
|
1339
|
+
who: various online sources
|
|
1340
|
+
when: before 2021
|
|
1341
|
+
language: English
|
|
1342
|
+
|
|
1343
|
+
- name: mmlu_professional_psychology
|
|
1344
|
+
display_name: Professional Psychology
|
|
1345
|
+
short_display_name: Professional Psychology
|
|
1346
|
+
description: The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1347
|
+
metric_groups:
|
|
1348
|
+
- accuracy
|
|
1349
|
+
- efficiency
|
|
1350
|
+
- general_information
|
|
1351
|
+
environment:
|
|
1352
|
+
main_name: exact_match
|
|
1353
|
+
main_split: test
|
|
1354
|
+
taxonomy:
|
|
1355
|
+
task: multiple-choice question answering
|
|
1356
|
+
what: professional psychology
|
|
1357
|
+
who: various online sources
|
|
1358
|
+
when: before 2021
|
|
1359
|
+
language: English
|
|
1360
|
+
|
|
1361
|
+
- name: mmlu_public_relations
|
|
1362
|
+
display_name: Public Relations
|
|
1363
|
+
short_display_name: Public Relations
|
|
1364
|
+
description: The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1365
|
+
metric_groups:
|
|
1366
|
+
- accuracy
|
|
1367
|
+
- efficiency
|
|
1368
|
+
- general_information
|
|
1369
|
+
environment:
|
|
1370
|
+
main_name: exact_match
|
|
1371
|
+
main_split: test
|
|
1372
|
+
taxonomy:
|
|
1373
|
+
task: multiple-choice question answering
|
|
1374
|
+
what: public relations
|
|
1375
|
+
who: various online sources
|
|
1376
|
+
when: before 2021
|
|
1377
|
+
language: English
|
|
1378
|
+
|
|
1379
|
+
- name: mmlu_security_studies
|
|
1380
|
+
display_name: Security Studies
|
|
1381
|
+
short_display_name: Security Studies
|
|
1382
|
+
description: The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1383
|
+
metric_groups:
|
|
1384
|
+
- accuracy
|
|
1385
|
+
- efficiency
|
|
1386
|
+
- general_information
|
|
1387
|
+
environment:
|
|
1388
|
+
main_name: exact_match
|
|
1389
|
+
main_split: test
|
|
1390
|
+
taxonomy:
|
|
1391
|
+
task: multiple-choice question answering
|
|
1392
|
+
what: security studies
|
|
1393
|
+
who: various online sources
|
|
1394
|
+
when: before 2021
|
|
1395
|
+
language: English
|
|
1396
|
+
|
|
1397
|
+
- name: mmlu_sociology
|
|
1398
|
+
display_name: Sociology
|
|
1399
|
+
short_display_name: Sociology
|
|
1400
|
+
description: The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1401
|
+
metric_groups:
|
|
1402
|
+
- accuracy
|
|
1403
|
+
- efficiency
|
|
1404
|
+
- general_information
|
|
1405
|
+
environment:
|
|
1406
|
+
main_name: exact_match
|
|
1407
|
+
main_split: test
|
|
1408
|
+
taxonomy:
|
|
1409
|
+
task: multiple-choice question answering
|
|
1410
|
+
what: sociology
|
|
1411
|
+
who: various online sources
|
|
1412
|
+
when: before 2021
|
|
1413
|
+
language: English
|
|
1414
|
+
|
|
1415
|
+
- name: mmlu_virology
|
|
1416
|
+
display_name: Virology
|
|
1417
|
+
short_display_name: Virology
|
|
1418
|
+
description: The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1419
|
+
metric_groups:
|
|
1420
|
+
- accuracy
|
|
1421
|
+
- efficiency
|
|
1422
|
+
- general_information
|
|
1423
|
+
environment:
|
|
1424
|
+
main_name: exact_match
|
|
1425
|
+
main_split: test
|
|
1426
|
+
taxonomy:
|
|
1427
|
+
task: multiple-choice question answering
|
|
1428
|
+
what: virology
|
|
1429
|
+
who: various online sources
|
|
1430
|
+
when: before 2021
|
|
1431
|
+
language: English
|
|
1432
|
+
|
|
1433
|
+
- name: mmlu_world_religions
|
|
1434
|
+
display_name: World Religions
|
|
1435
|
+
short_display_name: World Religions
|
|
1436
|
+
description: The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.
|
|
1437
|
+
metric_groups:
|
|
1438
|
+
- accuracy
|
|
1439
|
+
- efficiency
|
|
1440
|
+
- general_information
|
|
1441
|
+
environment:
|
|
1442
|
+
main_name: exact_match
|
|
1443
|
+
main_split: test
|
|
1444
|
+
taxonomy:
|
|
1445
|
+
task: multiple-choice question answering
|
|
1446
|
+
what: world religions
|
|
1447
|
+
who: various online sources
|
|
1448
|
+
when: before 2021
|
|
1449
|
+
language: English
|
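As a minimal sketch of how run-group entries like the ones added above could be consumed downstream (the file name schema_classic.yaml and the filtering logic are illustrative assumptions, not HELM's actual loading API):

    import yaml  # PyYAML

    # Hypothetical path to the schema file containing the run_groups above.
    with open("schema_classic.yaml") as f:
        schema = yaml.safe_load(f)

    # Each MMLU subject is one run-group dict with the same keys:
    # name, display_name, description, metric_groups, environment, taxonomy.
    for group in schema.get("run_groups", []):
        if group["name"].startswith("mmlu_"):
            taxonomy = group.get("taxonomy", {})
            print(group["name"], "->", taxonomy.get("what"),
                  "| main metric:", group["environment"]["main_name"])

Every subject shares the same metric groups (accuracy, efficiency, general_information), main metric (exact_match), and main split (test); only the name, display names, description, and the taxonomy's "what" field vary.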