PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4-py3-none-any.whl → 0.5.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/benchmark/static/schema_mmlu_winogrande_afr.yaml ADDED Viewed

@@ -0,0 +1,1045 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+############################################################
+perturbations: []
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+############################################################
+run_groups:
+  - name: benchmarks
+    display_name: All Benchmarks
+    description: Results for scenarios translated to 11 African languages by human translators.
+    category: Top-level results
+    subgroups:
+      - winogrande_afr
+      - mmlu_clinical_afr_clinical_knowledge
+      - mmlu_clinical_afr_college_medicine
+      - mmlu_clinical_afr_virology
+  - name: benchmarks_af
+    display_name: Afrikaans Benchmarks
+    description: Results for scenarios translated to Afrikaans by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_af
+      - mmlu_clinical_afr_clinical_knowledge_af
+      - mmlu_clinical_afr_college_medicine_af
+      - mmlu_clinical_afr_virology_af
+  - name: benchmarks_am
+    display_name: Amharic Benchmarks
+    description: Results for scenarios translated to Amharic by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_am
+      - mmlu_clinical_afr_clinical_knowledge_am
+      - mmlu_clinical_afr_college_medicine_am
+      - mmlu_clinical_afr_virology_am
+  - name: benchmarks_bm
+    display_name: Bambara Benchmarks
+    description: Results for scenarios translated to Bambara by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_bm
+      - mmlu_clinical_afr_clinical_knowledge_bm
+      - mmlu_clinical_afr_college_medicine_bm
+      - mmlu_clinical_afr_virology_bm
+  - name: benchmarks_ig
+    display_name: Igbo Benchmarks
+    description: Results for scenarios translated to Igbo by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_ig
+      - mmlu_clinical_afr_clinical_knowledge_ig
+      - mmlu_clinical_afr_college_medicine_ig
+      - mmlu_clinical_afr_virology_ig
+  - name: benchmarks_nso
+    display_name: Sepedi Benchmarks
+    description: Results for scenarios translated to Sepedi by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_nso
+      - mmlu_clinical_afr_clinical_knowledge_nso
+      - mmlu_clinical_afr_college_medicine_nso
+      - mmlu_clinical_afr_virology_nso
+  - name: benchmarks_sn
+    display_name: Shona Benchmarks
+    description: Results for scenarios translated to Shona by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_sn
+      - mmlu_clinical_afr_clinical_knowledge_sn
+      - mmlu_clinical_afr_college_medicine_sn
+      - mmlu_clinical_afr_virology_sn
+  - name: benchmarks_st
+    display_name: Sesotho Benchmarks
+    description: Results for scenarios translated to Sesotho by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_st
+      - mmlu_clinical_afr_clinical_knowledge_st
+      - mmlu_clinical_afr_college_medicine_st
+      - mmlu_clinical_afr_virology_st
+  - name: benchmarks_tn
+    display_name: Setswana Benchmarks
+    description: Results for scenarios translated to Setswana by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_tn
+      - mmlu_clinical_afr_clinical_knowledge_tn
+      - mmlu_clinical_afr_college_medicine_tn
+      - mmlu_clinical_afr_virology_tn
+  - name: benchmarks_ts
+    display_name: Tsonga Benchmarks
+    description: Results for scenarios translated to Tsonga by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_ts
+      - mmlu_clinical_afr_clinical_knowledge_ts
+      - mmlu_clinical_afr_college_medicine_ts
+      - mmlu_clinical_afr_virology_ts
+  - name: benchmarks_xh
+    display_name: Xhosa Benchmarks
+    description: Results for scenarios translated to Xhosa by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_xh
+      - mmlu_clinical_afr_clinical_knowledge_xh
+      - mmlu_clinical_afr_college_medicine_xh
+      - mmlu_clinical_afr_virology_xh
+  - name: benchmarks_zu
+    display_name: Zulu Benchmarks
+    description: Results for scenarios translated to Zulu by human translators.
+    category: Results by language
+    subgroups:
+      - winogrande_afr_zu
+      - mmlu_clinical_afr_clinical_knowledge_zu
+      - mmlu_clinical_afr_college_medicine_zu
+      - mmlu_clinical_afr_virology_zu
+  - name: winogrande_afr_by_language
+    display_name: WinoGrande
+    description: Results for WinoGrande by language.
+    category: Results by benchmark
+    subgroups:
+      - winogrande_afr_af
+      - winogrande_afr_am
+      - winogrande_afr_bm
+      - winogrande_afr_ig
+      - winogrande_afr_nso
+      - winogrande_afr_sn
+      - winogrande_afr_st
+      - winogrande_afr_tn
+      - winogrande_afr_ts
+      - winogrande_afr_xh
+      - winogrande_afr_zu
+  - name: mmlu_clinical_afr_clinical_knowledge_by_language
+    display_name: MMLU Clinical Knowledge
+    description: Results for MMLU Clinical Knowledge by language.
+    category: Results by benchmark
+    subgroups:
+      - mmlu_clinical_afr_clinical_knowledge_af
+      - mmlu_clinical_afr_clinical_knowledge_am
+      - mmlu_clinical_afr_clinical_knowledge_bm
+      - mmlu_clinical_afr_clinical_knowledge_ig
+      - mmlu_clinical_afr_clinical_knowledge_nso
+      - mmlu_clinical_afr_clinical_knowledge_sn
+      - mmlu_clinical_afr_clinical_knowledge_st
+      - mmlu_clinical_afr_clinical_knowledge_tn
+      - mmlu_clinical_afr_clinical_knowledge_ts
+      - mmlu_clinical_afr_clinical_knowledge_xh
+      - mmlu_clinical_afr_clinical_knowledge_zu
+  - name: mmlu_clinical_afr_college_medicine_by_language
+    display_name: MMLU College Medicine
+    description: Results for MMLU College Medicine by language.
+    category: Results by benchmark
+    subgroups:
+      - mmlu_clinical_afr_college_medicine_af
+      - mmlu_clinical_afr_college_medicine_am
+      - mmlu_clinical_afr_college_medicine_bm
+      - mmlu_clinical_afr_college_medicine_ig
+      - mmlu_clinical_afr_college_medicine_nso
+      - mmlu_clinical_afr_college_medicine_sn
+      - mmlu_clinical_afr_college_medicine_st
+      - mmlu_clinical_afr_college_medicine_tn
+      - mmlu_clinical_afr_college_medicine_ts
+      - mmlu_clinical_afr_college_medicine_xh
+      - mmlu_clinical_afr_college_medicine_zu
+  - name: mmlu_clinical_afr_virology_by_language
+    display_name: MMLU Virology
+    description: Results for MMLU Virology by language.
+    category: Results by benchmark
+    subgroups:
+      - mmlu_clinical_afr_virology_af
+      - mmlu_clinical_afr_virology_am
+      - mmlu_clinical_afr_virology_bm
+      - mmlu_clinical_afr_virology_ig
+      - mmlu_clinical_afr_virology_nso
+      - mmlu_clinical_afr_virology_sn
+      - mmlu_clinical_afr_virology_st
+      - mmlu_clinical_afr_virology_tn
+      - mmlu_clinical_afr_virology_ts
+      - mmlu_clinical_afr_virology_xh
+      - mmlu_clinical_afr_virology_zu
+  - name: winogrande_afr
+    display_name: Winogrande (11 African languages)
+    short_display_name: Winogrande (11 African languages)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to 11 African languages by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: 11 African languages
+  - name: winogrande_afr_af
+    display_name: Winogrande (Afrikaans)
+    short_display_name: Winogrande (Afrikaans)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Afrikaans by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Afrikaans
+  - name: winogrande_afr_am
+    display_name: Winogrande (Amharic)
+    short_display_name: Winogrande (Amharic)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Amharic by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Amharic
+  - name: winogrande_afr_bm
+    display_name: Winogrande (Bambara)
+    short_display_name: Winogrande (Bambara)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Bambara by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Bambara
+  - name: winogrande_afr_ig
+    display_name: Winogrande (Igbo)
+    short_display_name: Winogrande (Igbo)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Igbo by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Igbo
+  - name: winogrande_afr_nso
+    display_name: Winogrande (Sepedi)
+    short_display_name: Winogrande (Sepedi)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Sepedi by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Sepedi
+  - name: winogrande_afr_sn
+    display_name: Winogrande (Shona)
+    short_display_name: Winogrande (Shona)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Shona by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Shona
+  - name: winogrande_afr_st
+    display_name: Winogrande (Sesotho)
+    short_display_name: Winogrande (Sesotho)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Sesotho by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Sesotho
+  - name: winogrande_afr_tn
+    display_name: Winogrande (Setswana)
+    short_display_name: Winogrande (Setswana)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Setswana by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Setswana
+  - name: winogrande_afr_ts
+    display_name: Winogrande (Tsonga)
+    short_display_name: Winogrande (Tsonga)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Tsonga by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Tsonga
+  - name: winogrande_afr_xh
+    display_name: Winogrande (Xhosa)
+    short_display_name: Winogrande (Xhosa)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Xhosa by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Xhosa
+  - name: winogrande_afr_zu
+    display_name: Winogrande (Zulu)
+    short_display_name: Winogrande (Zulu)
+    description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Zulu by human translators
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice reasoning
+      what: pronoun resolution
+      who: workers on Amazon Mechanical Turk
+      when: before 2019
+      language: Zulu
+  - name: mmlu_clinical_afr_clinical_knowledge
+    display_name: MMLU Clinical Knowledge (11 African languages)
+    short_display_name: MMLU Clinical Knowledge (11 African languages)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to 11 African languages by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: 11 African languages
+  - name: mmlu_clinical_afr_clinical_knowledge_af
+    display_name: MMLU Clinical Knowledge (Afrikaans)
+    short_display_name: MMLU Clinical Knowledge (Afrikaans)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Afrikaans by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Afrikaans
+  - name: mmlu_clinical_afr_clinical_knowledge_am
+    display_name: MMLU Clinical Knowledge (Amharic)
+    short_display_name: MMLU Clinical Knowledge (Amharic)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Amharic by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Amharic
+  - name: mmlu_clinical_afr_clinical_knowledge_bm
+    display_name: MMLU Clinical Knowledge (Bambara)
+    short_display_name: MMLU Clinical Knowledge (Bambara)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Bambara by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Bambara
+  - name: mmlu_clinical_afr_clinical_knowledge_ig
+    display_name: MMLU Clinical Knowledge (Igbo)
+    short_display_name: MMLU Clinical Knowledge (Igbo)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Igbo by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Igbo
+  - name: mmlu_clinical_afr_clinical_knowledge_nso
+    display_name: MMLU Clinical Knowledge (Sepedi)
+    short_display_name: MMLU Clinical Knowledge (Sepedi)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sepedi by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Sepedi
+  - name: mmlu_clinical_afr_clinical_knowledge_sn
+    display_name: MMLU Clinical Knowledge (Shona)
+    short_display_name: MMLU Clinical Knowledge (Shona)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Shona by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Shona
+  - name: mmlu_clinical_afr_clinical_knowledge_st
+    display_name: MMLU Clinical Knowledge (Sesotho)
+    short_display_name: MMLU Clinical Knowledge (Sesotho)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sesotho by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Sesotho
+  - name: mmlu_clinical_afr_clinical_knowledge_tn
+    display_name: MMLU Clinical Knowledge (Setswana)
+    short_display_name: MMLU Clinical Knowledge (Setswana)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Setswana by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Setswana
+  - name: mmlu_clinical_afr_clinical_knowledge_ts
+    display_name: MMLU Clinical Knowledge (Tsonga)
+    short_display_name: MMLU Clinical Knowledge (Tsonga)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Tsonga by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Tsonga
+  - name: mmlu_clinical_afr_clinical_knowledge_xh
+    display_name: MMLU Clinical Knowledge (Xhosa)
+    short_display_name: MMLU Clinical Knowledge (Xhosa)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Xhosa by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Xhosa
+  - name: mmlu_clinical_afr_clinical_knowledge_zu
+    display_name: MMLU Clinical Knowledge (Zulu)
+    short_display_name: MMLU Clinical Knowledge (Zulu)
+    description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Zulu by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: clinical knowledge
+      who: various online sources
+      when: before 2021
+      language: Zulu
+  - name: mmlu_clinical_afr_college_medicine
+    display_name: MMLU College Medicine (11 African languages)
+    short_display_name: MMLU College Medicine (11 African languages)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to 11 African languages by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: 11 African languages
+  - name: mmlu_clinical_afr_college_medicine_af
+    display_name: MMLU College Medicine (Afrikaans)
+    short_display_name: MMLU College Medicine (Afrikaans)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Afrikaans by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Afrikaans
+  - name: mmlu_clinical_afr_college_medicine_am
+    display_name: MMLU College Medicine (Amharic)
+    short_display_name: MMLU College Medicine (Amharic)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Amharic by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Amharic
+  - name: mmlu_clinical_afr_college_medicine_bm
+    display_name: MMLU College Medicine (Bambara)
+    short_display_name: MMLU College Medicine (Bambara)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Bambara by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Bambara
+  - name: mmlu_clinical_afr_college_medicine_ig
+    display_name: MMLU College Medicine (Igbo)
+    short_display_name: MMLU College Medicine (Igbo)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Igbo by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Igbo
+  - name: mmlu_clinical_afr_college_medicine_nso
+    display_name: MMLU College Medicine (Sepedi)
+    short_display_name: MMLU College Medicine (Sepedi)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sepedi by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Sepedi
+  - name: mmlu_clinical_afr_college_medicine_sn
+    display_name: MMLU College Medicine (Shona)
+    short_display_name: MMLU College Medicine (Shona)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Shona by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Shona
+  - name: mmlu_clinical_afr_college_medicine_st
+    display_name: MMLU College Medicine (Sesotho)
+    short_display_name: MMLU College Medicine (Sesotho)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sesotho by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Sesotho
+  - name: mmlu_clinical_afr_college_medicine_tn
+    display_name: MMLU College Medicine (Setswana)
+    short_display_name: MMLU College Medicine (Setswana)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Setswana by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Setswana
+  - name: mmlu_clinical_afr_college_medicine_ts
+    display_name: MMLU College Medicine (Tsonga)
+    short_display_name: MMLU College Medicine (Tsonga)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Tsonga by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Tsonga
+  - name: mmlu_clinical_afr_college_medicine_xh
+    display_name: MMLU College Medicine (Xhosa)
+    short_display_name: MMLU College Medicine (Xhosa)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Xhosa by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Xhosa
+  - name: mmlu_clinical_afr_college_medicine_zu
+    display_name: MMLU College Medicine (Zulu)
+    short_display_name: MMLU College Medicine (Zulu)
+    description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Zulu by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: college medicine
+      who: various online sources
+      when: before 2021
+      language: Zulu
+  - name: mmlu_clinical_afr_virology
+    display_name: MMLU Virology (11 African languages)
+    short_display_name: MMLU Virology (11 African languages)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to 11 African languages by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: 11 African languages
+  - name: mmlu_clinical_afr_virology_af
+    display_name: MMLU Virology (Afrikaans)
+    short_display_name: MMLU Virology (Afrikaans)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Afrikaans by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Afrikaans
+  - name: mmlu_clinical_afr_virology_am
+    display_name: MMLU Virology (Amharic)
+    short_display_name: MMLU Virology (Amharic)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Amharic by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Amharic
+  - name: mmlu_clinical_afr_virology_bm
+    display_name: MMLU Virology (Bambara)
+    short_display_name: MMLU Virology (Bambara)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Bambara by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Bambara
+  - name: mmlu_clinical_afr_virology_ig
+    display_name: MMLU Virology (Igbo)
+    short_display_name: MMLU Virology (Igbo)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Igbo by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Igbo
+  - name: mmlu_clinical_afr_virology_nso
+    display_name: MMLU Virology (Sepedi)
+    short_display_name: MMLU Virology (Sepedi)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sepedi by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Sepedi
+  - name: mmlu_clinical_afr_virology_sn
+    display_name: MMLU Virology (Shona)
+    short_display_name: MMLU Virology (Shona)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Shona by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Shona
+  - name: mmlu_clinical_afr_virology_st
+    display_name: MMLU Virology (Sesotho)
+    short_display_name: MMLU Virology (Sesotho)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sesotho by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Sesotho
+  - name: mmlu_clinical_afr_virology_tn
+    display_name: MMLU Virology (Setswana)
+    short_display_name: MMLU Virology (Setswana)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Setswana by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Setswana
+  - name: mmlu_clinical_afr_virology_ts
+    display_name: MMLU Virology (Tsonga)
+    short_display_name: MMLU Virology (Tsonga)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Tsonga by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Tsonga
+  - name: mmlu_clinical_afr_virology_xh
+    display_name: MMLU Virology (Xhosa)
+    short_display_name: MMLU Virology (Xhosa)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Xhosa by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Xhosa
+  - name: mmlu_clinical_afr_virology_zu
+    display_name: MMLU Virology (Zulu)
+    short_display_name: MMLU Virology (Zulu)
+    description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Zulu by human translators.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: virology
+      who: various online sources
+      when: before 2021
+      language: Zulu

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl