crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1045 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
+ description: Fraction of instances where the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
+ description: Fraction of instances where the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: exact_match
69
+ display_name: Exact match
70
+ short_display_name: EM
71
+ description: Fraction of instances for which the predicted output matches a correct reference exactly.
72
+ lower_is_better: false
73
+
74
+ ############################################################
75
+ perturbations: []
76
+
77
+ ############################################################
78
+ metric_groups:
79
+ - name: accuracy
80
+ display_name: Accuracy
81
+ aggregation_strategies:
82
+ - mean
83
+ metrics:
84
+ - name: ${main_name}
85
+ split: ${main_split}
86
+
87
+ ############################################################
88
+
89
+ run_groups:
90
+
91
+ - name: benchmarks
92
+ display_name: All Benchmarks
93
+ description: Results for scenarios translated to 11 African languages by human translators.
94
+ category: Top-level results
95
+ subgroups:
96
+ - winogrande_afr
97
+ - mmlu_clinical_afr_clinical_knowledge
98
+ - mmlu_clinical_afr_college_medicine
99
+ - mmlu_clinical_afr_virology
100
+
101
+ - name: benchmarks_af
102
+ display_name: Afrikaans Benchmarks
103
+ description: Results for scenarios translated to Afrikaans by human translators.
104
+ category: Results by language
105
+ subgroups:
106
+ - winogrande_afr_af
107
+ - mmlu_clinical_afr_clinical_knowledge_af
108
+ - mmlu_clinical_afr_college_medicine_af
109
+ - mmlu_clinical_afr_virology_af
110
+
111
+ - name: benchmarks_am
112
+ display_name: Amharic Benchmarks
113
+ description: Results for scenarios translated to Amharic by human translators.
114
+ category: Results by language
115
+ subgroups:
116
+ - winogrande_afr_am
117
+ - mmlu_clinical_afr_clinical_knowledge_am
118
+ - mmlu_clinical_afr_college_medicine_am
119
+ - mmlu_clinical_afr_virology_am
120
+
121
+ - name: benchmarks_bm
122
+ display_name: Bambara Benchmarks
123
+ description: Results for scenarios translated to Bambara by human translators.
124
+ category: Results by language
125
+ subgroups:
126
+ - winogrande_afr_bm
127
+ - mmlu_clinical_afr_clinical_knowledge_bm
128
+ - mmlu_clinical_afr_college_medicine_bm
129
+ - mmlu_clinical_afr_virology_bm
130
+
131
+ - name: benchmarks_ig
132
+ display_name: Igbo Benchmarks
133
+ description: Results for scenarios translated to Igbo by human translators.
134
+ category: Results by language
135
+ subgroups:
136
+ - winogrande_afr_ig
137
+ - mmlu_clinical_afr_clinical_knowledge_ig
138
+ - mmlu_clinical_afr_college_medicine_ig
139
+ - mmlu_clinical_afr_virology_ig
140
+
141
+ - name: benchmarks_nso
142
+ display_name: Sepedi Benchmarks
143
+ description: Results for scenarios translated to Sepedi by human translators.
144
+ category: Results by language
145
+ subgroups:
146
+ - winogrande_afr_nso
147
+ - mmlu_clinical_afr_clinical_knowledge_nso
148
+ - mmlu_clinical_afr_college_medicine_nso
149
+ - mmlu_clinical_afr_virology_nso
150
+
151
+ - name: benchmarks_sn
152
+ display_name: Shona Benchmarks
153
+ description: Results for scenarios translated to Shona by human translators.
154
+ category: Results by language
155
+ subgroups:
156
+ - winogrande_afr_sn
157
+ - mmlu_clinical_afr_clinical_knowledge_sn
158
+ - mmlu_clinical_afr_college_medicine_sn
159
+ - mmlu_clinical_afr_virology_sn
160
+
161
+ - name: benchmarks_st
162
+ display_name: Sesotho Benchmarks
163
+ description: Results for scenarios translated to Sesotho by human translators.
164
+ category: Results by language
165
+ subgroups:
166
+ - winogrande_afr_st
167
+ - mmlu_clinical_afr_clinical_knowledge_st
168
+ - mmlu_clinical_afr_college_medicine_st
169
+ - mmlu_clinical_afr_virology_st
170
+
171
+ - name: benchmarks_tn
172
+ display_name: Setswana Benchmarks
173
+ description: Results for scenarios translated to Setswana by human translators.
174
+ category: Results by language
175
+ subgroups:
176
+ - winogrande_afr_tn
177
+ - mmlu_clinical_afr_clinical_knowledge_tn
178
+ - mmlu_clinical_afr_college_medicine_tn
179
+ - mmlu_clinical_afr_virology_tn
180
+
181
+ - name: benchmarks_ts
182
+ display_name: Tsonga Benchmarks
183
+ description: Results for scenarios translated to Tsonga by human translators.
184
+ category: Results by language
185
+ subgroups:
186
+ - winogrande_afr_ts
187
+ - mmlu_clinical_afr_clinical_knowledge_ts
188
+ - mmlu_clinical_afr_college_medicine_ts
189
+ - mmlu_clinical_afr_virology_ts
190
+
191
+ - name: benchmarks_xh
192
+ display_name: Xhosa Benchmarks
193
+ description: Results for scenarios translated to Xhosa by human translators.
194
+ category: Results by language
195
+ subgroups:
196
+ - winogrande_afr_xh
197
+ - mmlu_clinical_afr_clinical_knowledge_xh
198
+ - mmlu_clinical_afr_college_medicine_xh
199
+ - mmlu_clinical_afr_virology_xh
200
+
201
+ - name: benchmarks_zu
202
+ display_name: Zulu Benchmarks
203
+ description: Results for scenarios translated to Zulu by human translators.
204
+ category: Results by language
205
+ subgroups:
206
+ - winogrande_afr_zu
207
+ - mmlu_clinical_afr_clinical_knowledge_zu
208
+ - mmlu_clinical_afr_college_medicine_zu
209
+ - mmlu_clinical_afr_virology_zu
210
+
211
+ - name: winogrande_afr_by_language
+ display_name: WinoGrande
212
+ description: Results for WinoGrande by language.
213
+ category: Results by benchmark
214
+ subgroups:
215
+ - winogrande_afr_af
216
+ - winogrande_afr_am
217
+ - winogrande_afr_bm
218
+ - winogrande_afr_ig
219
+ - winogrande_afr_nso
220
+ - winogrande_afr_sn
221
+ - winogrande_afr_st
222
+ - winogrande_afr_tn
223
+ - winogrande_afr_ts
224
+ - winogrande_afr_xh
225
+ - winogrande_afr_zu
226
+
227
+ - name: mmlu_clinical_afr_clinical_knowledge_by_language
228
+ display_name: MMLU Clinical Knowledge
229
+ description: Results for MMLU Clinical Knowledge by language.
230
+ category: Results by benchmark
231
+ subgroups:
232
+ - mmlu_clinical_afr_clinical_knowledge_af
233
+ - mmlu_clinical_afr_clinical_knowledge_am
234
+ - mmlu_clinical_afr_clinical_knowledge_bm
235
+ - mmlu_clinical_afr_clinical_knowledge_ig
236
+ - mmlu_clinical_afr_clinical_knowledge_nso
237
+ - mmlu_clinical_afr_clinical_knowledge_sn
238
+ - mmlu_clinical_afr_clinical_knowledge_st
239
+ - mmlu_clinical_afr_clinical_knowledge_tn
240
+ - mmlu_clinical_afr_clinical_knowledge_ts
241
+ - mmlu_clinical_afr_clinical_knowledge_xh
242
+ - mmlu_clinical_afr_clinical_knowledge_zu
243
+
244
+ - name: mmlu_clinical_afr_college_medicine_by_language
245
+ display_name: MMLU College Medicine
246
+ description: Results for MMLU College Medicine by language.
247
+ category: Results by benchmark
248
+ subgroups:
249
+ - mmlu_clinical_afr_college_medicine_af
250
+ - mmlu_clinical_afr_college_medicine_am
251
+ - mmlu_clinical_afr_college_medicine_bm
252
+ - mmlu_clinical_afr_college_medicine_ig
253
+ - mmlu_clinical_afr_college_medicine_nso
254
+ - mmlu_clinical_afr_college_medicine_sn
255
+ - mmlu_clinical_afr_college_medicine_st
256
+ - mmlu_clinical_afr_college_medicine_tn
257
+ - mmlu_clinical_afr_college_medicine_ts
258
+ - mmlu_clinical_afr_college_medicine_xh
259
+ - mmlu_clinical_afr_college_medicine_zu
260
+
261
+ - name: mmlu_clinical_afr_virology_by_language
262
+ display_name: MMLU Virology
263
+ description: Results for MMLU Virology by language.
264
+ category: Results by benchmark
265
+ subgroups:
266
+ - mmlu_clinical_afr_virology_af
267
+ - mmlu_clinical_afr_virology_am
268
+ - mmlu_clinical_afr_virology_bm
269
+ - mmlu_clinical_afr_virology_ig
270
+ - mmlu_clinical_afr_virology_nso
271
+ - mmlu_clinical_afr_virology_sn
272
+ - mmlu_clinical_afr_virology_st
273
+ - mmlu_clinical_afr_virology_tn
274
+ - mmlu_clinical_afr_virology_ts
275
+ - mmlu_clinical_afr_virology_xh
276
+ - mmlu_clinical_afr_virology_zu
277
+
278
+ - name: winogrande_afr
279
+ display_name: Winogrande (11 African languages)
280
+ short_display_name: Winogrande (11 African languages)
281
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to 11 African languages by human translators
282
+ metric_groups:
283
+ - accuracy
284
+ environment:
285
+ main_name: exact_match
286
+ main_split: test
287
+ taxonomy:
288
+ task: multiple-choice reasoning
289
+ what: pronoun resolution
290
+ who: workers on Amazon Mechanical Turk
291
+ when: before 2019
292
+ language: 11 African languages
293
+
294
+ - name: winogrande_afr_af
295
+ display_name: Winogrande (Afrikaans)
296
+ short_display_name: Winogrande (Afrikaans)
297
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Afrikaans by human translators
298
+ metric_groups:
299
+ - accuracy
300
+ environment:
301
+ main_name: exact_match
302
+ main_split: test
303
+ taxonomy:
304
+ task: multiple-choice reasoning
305
+ what: pronoun resolution
306
+ who: workers on Amazon Mechanical Turk
307
+ when: before 2019
308
+ language: Afrikaans
309
+
310
+ - name: winogrande_afr_am
311
+ display_name: Winogrande (Amharic)
312
+ short_display_name: Winogrande (Amharic)
313
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Amharic by human translators
314
+ metric_groups:
315
+ - accuracy
316
+ environment:
317
+ main_name: exact_match
318
+ main_split: test
319
+ taxonomy:
320
+ task: multiple-choice reasoning
321
+ what: pronoun resolution
322
+ who: workers on Amazon Mechanical Turk
323
+ when: before 2019
324
+ language: Amharic
325
+
326
+ - name: winogrande_afr_bm
327
+ display_name: Winogrande (Bambara)
328
+ short_display_name: Winogrande (Bambara)
329
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Bambara by human translators
330
+ metric_groups:
331
+ - accuracy
332
+ environment:
333
+ main_name: exact_match
334
+ main_split: test
335
+ taxonomy:
336
+ task: multiple-choice reasoning
337
+ what: pronoun resolution
338
+ who: workers on Amazon Mechanical Turk
339
+ when: before 2019
340
+ language: Bambara
341
+
342
+ - name: winogrande_afr_ig
343
+ display_name: Winogrande (Igbo)
344
+ short_display_name: Winogrande (Igbo)
345
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Igbo by human translators
346
+ metric_groups:
347
+ - accuracy
348
+ environment:
349
+ main_name: exact_match
350
+ main_split: test
351
+ taxonomy:
352
+ task: multiple-choice reasoning
353
+ what: pronoun resolution
354
+ who: workers on Amazon Mechanical Turk
355
+ when: before 2019
356
+ language: Igbo
357
+
358
+ - name: winogrande_afr_nso
359
+ display_name: Winogrande (Sepedi)
360
+ short_display_name: Winogrande (Sepedi)
361
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Sepedi by human translators
362
+ metric_groups:
363
+ - accuracy
364
+ environment:
365
+ main_name: exact_match
366
+ main_split: test
367
+ taxonomy:
368
+ task: multiple-choice reasoning
369
+ what: pronoun resolution
370
+ who: workers on Amazon Mechanical Turk
371
+ when: before 2019
372
+ language: Sepedi
373
+
374
+ - name: winogrande_afr_sn
375
+ display_name: Winogrande (Shona)
376
+ short_display_name: Winogrande (Shona)
377
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Shona by human translators
378
+ metric_groups:
379
+ - accuracy
380
+ environment:
381
+ main_name: exact_match
382
+ main_split: test
383
+ taxonomy:
384
+ task: multiple-choice reasoning
385
+ what: pronoun resolution
386
+ who: workers on Amazon Mechanical Turk
387
+ when: before 2019
388
+ language: Shona
389
+
390
+ - name: winogrande_afr_st
391
+ display_name: Winogrande (Sesotho)
392
+ short_display_name: Winogrande (Sesotho)
393
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Sesotho by human translators
394
+ metric_groups:
395
+ - accuracy
396
+ environment:
397
+ main_name: exact_match
398
+ main_split: test
399
+ taxonomy:
400
+ task: multiple-choice reasoning
401
+ what: pronoun resolution
402
+ who: workers on Amazon Mechanical Turk
403
+ when: before 2019
404
+ language: Sesotho
405
+
406
+ - name: winogrande_afr_tn
407
+ display_name: Winogrande (Setswana)
408
+ short_display_name: Winogrande (Setswana)
409
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Setswana by human translators
410
+ metric_groups:
411
+ - accuracy
412
+ environment:
413
+ main_name: exact_match
414
+ main_split: test
415
+ taxonomy:
416
+ task: multiple-choice reasoning
417
+ what: pronoun resolution
418
+ who: workers on Amazon Mechanical Turk
419
+ when: before 2019
420
+ language: Setswana
421
+
422
+ - name: winogrande_afr_ts
423
+ display_name: Winogrande (Tsonga)
424
+ short_display_name: Winogrande (Tsonga)
425
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Tsonga by human translators
426
+ metric_groups:
427
+ - accuracy
428
+ environment:
429
+ main_name: exact_match
430
+ main_split: test
431
+ taxonomy:
432
+ task: multiple-choice reasoning
433
+ what: pronoun resolution
434
+ who: workers on Amazon Mechanical Turk
435
+ when: before 2019
436
+ language: Tsonga
437
+
438
+ - name: winogrande_afr_xh
439
+ display_name: Winogrande (Xhosa)
440
+ short_display_name: Winogrande (Xhosa)
441
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Xhosa by human translators
442
+ metric_groups:
443
+ - accuracy
444
+ environment:
445
+ main_name: exact_match
446
+ main_split: test
447
+ taxonomy:
448
+ task: multiple-choice reasoning
449
+ what: pronoun resolution
450
+ who: workers on Amazon Mechanical Turk
451
+ when: before 2019
452
+ language: Xhosa
453
+
454
+ - name: winogrande_afr_zu
455
+ display_name: Winogrande (Zulu)
456
+ short_display_name: Winogrande (Zulu)
457
+ description: The multiple-choice reasoning benchmark Winogrande ([Sakaguchi et al. 2021](https://arxiv.org/abs/1907.10641)) translated to Zulu by human translators
458
+ metric_groups:
459
+ - accuracy
460
+ environment:
461
+ main_name: exact_match
462
+ main_split: test
463
+ taxonomy:
464
+ task: multiple-choice reasoning
465
+ what: pronoun resolution
466
+ who: workers on Amazon Mechanical Turk
467
+ when: before 2019
468
+ language: Zulu
469
+
470
+ - name: mmlu_clinical_afr_clinical_knowledge
471
+ display_name: MMLU Clinical Knowledge (11 African languages)
472
+ short_display_name: MMLU Clinical Knowledge (11 African languages)
473
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to 11 African languages by human translators.
474
+ metric_groups:
475
+ - accuracy
476
+ environment:
477
+ main_name: exact_match
478
+ main_split: test
479
+ taxonomy:
480
+ task: multiple-choice question answering
481
+ what: clinical knowledge
482
+ who: various online sources
483
+ when: before 2021
484
+ language: 11 African languages
485
+
486
+ - name: mmlu_clinical_afr_clinical_knowledge_af
487
+ display_name: MMLU Clinical Knowledge (Afrikaans)
488
+ short_display_name: MMLU Clinical Knowledge (Afrikaans)
489
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Afrikaans by human translators.
490
+ metric_groups:
491
+ - accuracy
492
+ environment:
493
+ main_name: exact_match
494
+ main_split: test
495
+ taxonomy:
496
+ task: multiple-choice question answering
497
+ what: clinical knowledge
498
+ who: various online sources
499
+ when: before 2021
500
+ language: Afrikaans
501
+
502
+ - name: mmlu_clinical_afr_clinical_knowledge_am
503
+ display_name: MMLU Clinical Knowledge (Amharic)
504
+ short_display_name: MMLU Clinical Knowledge (Amharic)
505
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Amharic by human translators.
506
+ metric_groups:
507
+ - accuracy
508
+ environment:
509
+ main_name: exact_match
510
+ main_split: test
511
+ taxonomy:
512
+ task: multiple-choice question answering
513
+ what: clinical knowledge
514
+ who: various online sources
515
+ when: before 2021
516
+ language: Amharic
517
+
518
+ - name: mmlu_clinical_afr_clinical_knowledge_bm
519
+ display_name: MMLU Clinical Knowledge (Bambara)
520
+ short_display_name: MMLU Clinical Knowledge (Bambara)
521
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Bambara by human translators.
522
+ metric_groups:
523
+ - accuracy
524
+ environment:
525
+ main_name: exact_match
526
+ main_split: test
527
+ taxonomy:
528
+ task: multiple-choice question answering
529
+ what: clinical knowledge
530
+ who: various online sources
531
+ when: before 2021
532
+ language: Bambara
533
+
534
+ - name: mmlu_clinical_afr_clinical_knowledge_ig
535
+ display_name: MMLU Clinical Knowledge (Igbo)
536
+ short_display_name: MMLU Clinical Knowledge (Igbo)
537
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Igbo by human translators.
538
+ metric_groups:
539
+ - accuracy
540
+ environment:
541
+ main_name: exact_match
542
+ main_split: test
543
+ taxonomy:
544
+ task: multiple-choice question answering
545
+ what: clinical knowledge
546
+ who: various online sources
547
+ when: before 2021
548
+ language: Igbo
549
+
550
+ - name: mmlu_clinical_afr_clinical_knowledge_nso
551
+ display_name: MMLU Clinical Knowledge (Sepedi)
552
+ short_display_name: MMLU Clinical Knowledge (Sepedi)
553
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sepedi by human translators.
554
+ metric_groups:
555
+ - accuracy
556
+ environment:
557
+ main_name: exact_match
558
+ main_split: test
559
+ taxonomy:
560
+ task: multiple-choice question answering
561
+ what: clinical knowledge
562
+ who: various online sources
563
+ when: before 2021
564
+ language: Sepedi
565
+
566
+ - name: mmlu_clinical_afr_clinical_knowledge_sn
567
+ display_name: MMLU Clinical Knowledge (Shona)
568
+ short_display_name: MMLU Clinical Knowledge (Shona)
569
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Shona by human translators.
570
+ metric_groups:
571
+ - accuracy
572
+ environment:
573
+ main_name: exact_match
574
+ main_split: test
575
+ taxonomy:
576
+ task: multiple-choice question answering
577
+ what: clinical knowledge
578
+ who: various online sources
579
+ when: before 2021
580
+ language: Shona
581
+
582
+ - name: mmlu_clinical_afr_clinical_knowledge_st
583
+ display_name: MMLU Clinical Knowledge (Sesotho)
584
+ short_display_name: MMLU Clinical Knowledge (Sesotho)
585
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sesotho by human translators.
586
+ metric_groups:
587
+ - accuracy
588
+ environment:
589
+ main_name: exact_match
590
+ main_split: test
591
+ taxonomy:
592
+ task: multiple-choice question answering
593
+ what: clinical knowledge
594
+ who: various online sources
595
+ when: before 2021
596
+ language: Sesotho
597
+
598
+ - name: mmlu_clinical_afr_clinical_knowledge_tn
599
+ display_name: MMLU Clinical Knowledge (Setswana)
600
+ short_display_name: MMLU Clinical Knowledge (Setswana)
601
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Setswana by human translators.
602
+ metric_groups:
603
+ - accuracy
604
+ environment:
605
+ main_name: exact_match
606
+ main_split: test
607
+ taxonomy:
608
+ task: multiple-choice question answering
609
+ what: clinical knowledge
610
+ who: various online sources
611
+ when: before 2021
612
+ language: Setswana
613
+
614
+ - name: mmlu_clinical_afr_clinical_knowledge_ts
615
+ display_name: MMLU Clinical Knowledge (Tsonga)
616
+ short_display_name: MMLU Clinical Knowledge (Tsonga)
617
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Tsonga by human translators.
618
+ metric_groups:
619
+ - accuracy
620
+ environment:
621
+ main_name: exact_match
622
+ main_split: test
623
+ taxonomy:
624
+ task: multiple-choice question answering
625
+ what: clinical knowledge
626
+ who: various online sources
627
+ when: before 2021
628
+ language: Tsonga
629
+
630
+ - name: mmlu_clinical_afr_clinical_knowledge_xh
631
+ display_name: MMLU Clinical Knowledge (Xhosa)
632
+ short_display_name: MMLU Clinical Knowledge (Xhosa)
633
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Xhosa by human translators.
634
+ metric_groups:
635
+ - accuracy
636
+ environment:
637
+ main_name: exact_match
638
+ main_split: test
639
+ taxonomy:
640
+ task: multiple-choice question answering
641
+ what: clinical knowledge
642
+ who: various online sources
643
+ when: before 2021
644
+ language: Xhosa
645
+
646
+ - name: mmlu_clinical_afr_clinical_knowledge_zu
647
+ display_name: MMLU Clinical Knowledge (Zulu)
648
+ short_display_name: MMLU Clinical Knowledge (Zulu)
649
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Zulu by human translators.
650
+ metric_groups:
651
+ - accuracy
652
+ environment:
653
+ main_name: exact_match
654
+ main_split: test
655
+ taxonomy:
656
+ task: multiple-choice question answering
657
+ what: clinical knowledge
658
+ who: various online sources
659
+ when: before 2021
660
+ language: Zulu
661
+
662
+ - name: mmlu_clinical_afr_college_medicine
663
+ display_name: MMLU College Medicine (11 African languages)
664
+ short_display_name: MMLU College Medicine (11 African languages)
665
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to 11 African languages by human translators.
666
+ metric_groups:
667
+ - accuracy
668
+ environment:
669
+ main_name: exact_match
670
+ main_split: test
671
+ taxonomy:
672
+ task: multiple-choice question answering
673
+ what: college medicine
674
+ who: various online sources
675
+ when: before 2021
676
+ language: 11 African languages
677
+
678
+ - name: mmlu_clinical_afr_college_medicine_af
679
+ display_name: MMLU College Medicine (Afrikaans)
680
+ short_display_name: MMLU College Medicine (Afrikaans)
681
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Afrikaans by human translators.
682
+ metric_groups:
683
+ - accuracy
684
+ environment:
685
+ main_name: exact_match
686
+ main_split: test
687
+ taxonomy:
688
+ task: multiple-choice question answering
689
+ what: college medicine
690
+ who: various online sources
691
+ when: before 2021
692
+ language: Afrikaans
693
+
694
+ - name: mmlu_clinical_afr_college_medicine_am
695
+ display_name: MMLU College Medicine (Amharic)
696
+ short_display_name: MMLU College Medicine (Amharic)
697
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Amharic by human translators.
698
+ metric_groups:
699
+ - accuracy
700
+ environment:
701
+ main_name: exact_match
702
+ main_split: test
703
+ taxonomy:
704
+ task: multiple-choice question answering
705
+ what: college medicine
706
+ who: various online sources
707
+ when: before 2021
708
+ language: Amharic
709
+
710
+ - name: mmlu_clinical_afr_college_medicine_bm
711
+ display_name: MMLU College Medicine (Bambara)
712
+ short_display_name: MMLU College Medicine (Bambara)
713
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Bambara by human translators.
714
+ metric_groups:
715
+ - accuracy
716
+ environment:
717
+ main_name: exact_match
718
+ main_split: test
719
+ taxonomy:
720
+ task: multiple-choice question answering
721
+ what: college medicine
722
+ who: various online sources
723
+ when: before 2021
724
+ language: Bambara
725
+
726
+ - name: mmlu_clinical_afr_college_medicine_ig
727
+ display_name: MMLU College Medicine (Igbo)
728
+ short_display_name: MMLU College Medicine (Igbo)
729
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Igbo by human translators.
730
+ metric_groups:
731
+ - accuracy
732
+ environment:
733
+ main_name: exact_match
734
+ main_split: test
735
+ taxonomy:
736
+ task: multiple-choice question answering
737
+ what: college medicine
738
+ who: various online sources
739
+ when: before 2021
740
+ language: Igbo
741
+
742
+ - name: mmlu_clinical_afr_college_medicine_nso
743
+ display_name: MMLU College Medicine (Sepedi)
744
+ short_display_name: MMLU College Medicine (Sepedi)
745
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sepedi by human translators.
746
+ metric_groups:
747
+ - accuracy
748
+ environment:
749
+ main_name: exact_match
750
+ main_split: test
751
+ taxonomy:
752
+ task: multiple-choice question answering
753
+ what: college medicine
754
+ who: various online sources
755
+ when: before 2021
756
+ language: Sepedi
757
+
758
+ - name: mmlu_clinical_afr_college_medicine_sn
759
+ display_name: MMLU College Medicine (Shona)
760
+ short_display_name: MMLU College Medicine (Shona)
761
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Shona by human translators.
762
+ metric_groups:
763
+ - accuracy
764
+ environment:
765
+ main_name: exact_match
766
+ main_split: test
767
+ taxonomy:
768
+ task: multiple-choice question answering
769
+ what: college medicine
770
+ who: various online sources
771
+ when: before 2021
772
+ language: Shona
773
+
774
+ - name: mmlu_clinical_afr_college_medicine_st
775
+ display_name: MMLU College Medicine (Sesotho)
776
+ short_display_name: MMLU College Medicine (Sesotho)
777
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sesotho by human translators.
778
+ metric_groups:
779
+ - accuracy
780
+ environment:
781
+ main_name: exact_match
782
+ main_split: test
783
+ taxonomy:
784
+ task: multiple-choice question answering
785
+ what: college medicine
786
+ who: various online sources
787
+ when: before 2021
788
+ language: Sesotho
789
+
790
+ - name: mmlu_clinical_afr_college_medicine_tn
791
+ display_name: MMLU College Medicine (Setswana)
792
+ short_display_name: MMLU College Medicine (Setswana)
793
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Setswana by human translators.
794
+ metric_groups:
795
+ - accuracy
796
+ environment:
797
+ main_name: exact_match
798
+ main_split: test
799
+ taxonomy:
800
+ task: multiple-choice question answering
801
+ what: college medicine
802
+ who: various online sources
803
+ when: before 2021
804
+ language: Setswana
805
+
806
+ - name: mmlu_clinical_afr_college_medicine_ts
807
+ display_name: MMLU College Medicine (Tsonga)
808
+ short_display_name: MMLU College Medicine (Tsonga)
809
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Tsonga by human translators.
810
+ metric_groups:
811
+ - accuracy
812
+ environment:
813
+ main_name: exact_match
814
+ main_split: test
815
+ taxonomy:
816
+ task: multiple-choice question answering
817
+ what: college medicine
818
+ who: various online sources
819
+ when: before 2021
820
+ language: Tsonga
821
+
822
+ - name: mmlu_clinical_afr_college_medicine_xh
823
+ display_name: MMLU College Medicine (Xhosa)
824
+ short_display_name: MMLU College Medicine (Xhosa)
825
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Xhosa by human translators.
826
+ metric_groups:
827
+ - accuracy
828
+ environment:
829
+ main_name: exact_match
830
+ main_split: test
831
+ taxonomy:
832
+ task: multiple-choice question answering
833
+ what: college medicine
834
+ who: various online sources
835
+ when: before 2021
836
+ language: Xhosa
837
+
838
+ - name: mmlu_clinical_afr_college_medicine_zu
839
+ display_name: MMLU College Medicine (Zulu)
840
+ short_display_name: MMLU College Medicine (Zulu)
841
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Zulu by human translators.
842
+ metric_groups:
843
+ - accuracy
844
+ environment:
845
+ main_name: exact_match
846
+ main_split: test
847
+ taxonomy:
848
+ task: multiple-choice question answering
849
+ what: college medicine
850
+ who: various online sources
851
+ when: before 2021
852
+ language: Zulu
853
+
854
+ - name: mmlu_clinical_afr_virology
855
+ display_name: MMLU Virology (11 African languages)
856
+ short_display_name: MMLU Virology (11 African languages)
857
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to 11 African languages by human translators.
858
+ metric_groups:
859
+ - accuracy
860
+ environment:
861
+ main_name: exact_match
862
+ main_split: test
863
+ taxonomy:
864
+ task: multiple-choice question answering
865
+ what: virology
866
+ who: various online sources
867
+ when: before 2021
868
+ language: 11 African languages
869
+
870
+ - name: mmlu_clinical_afr_virology_af
871
+ display_name: MMLU Virology (Afrikaans)
872
+ short_display_name: MMLU Virology (Afrikaans)
873
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Afrikaans by human translators.
874
+ metric_groups:
875
+ - accuracy
876
+ environment:
877
+ main_name: exact_match
878
+ main_split: test
879
+ taxonomy:
880
+ task: multiple-choice question answering
881
+ what: virology
882
+ who: various online sources
883
+ when: before 2021
884
+ language: Afrikaans
885
+
886
+ - name: mmlu_clinical_afr_virology_am
887
+ display_name: MMLU Virology (Amharic)
888
+ short_display_name: MMLU Virology (Amharic)
889
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Amharic by human translators.
890
+ metric_groups:
891
+ - accuracy
892
+ environment:
893
+ main_name: exact_match
894
+ main_split: test
895
+ taxonomy:
896
+ task: multiple-choice question answering
897
+ what: virology
898
+ who: various online sources
899
+ when: before 2021
900
+ language: Amharic
901
+
902
+ - name: mmlu_clinical_afr_virology_bm
903
+ display_name: MMLU Virology (Bambara)
904
+ short_display_name: MMLU Virology (Bambara)
905
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Bambara by human translators.
906
+ metric_groups:
907
+ - accuracy
908
+ environment:
909
+ main_name: exact_match
910
+ main_split: test
911
+ taxonomy:
912
+ task: multiple-choice question answering
913
+ what: virology
914
+ who: various online sources
915
+ when: before 2021
916
+ language: Bambara
917
+
918
+ - name: mmlu_clinical_afr_virology_ig
919
+ display_name: MMLU Virology (Igbo)
920
+ short_display_name: MMLU Virology (Igbo)
921
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Igbo by human translators.
922
+ metric_groups:
923
+ - accuracy
924
+ environment:
925
+ main_name: exact_match
926
+ main_split: test
927
+ taxonomy:
928
+ task: multiple-choice question answering
929
+ what: virology
930
+ who: various online sources
931
+ when: before 2021
932
+ language: Igbo
933
+
934
+ - name: mmlu_clinical_afr_virology_nso
935
+ display_name: MMLU Virology (Sepedi)
936
+ short_display_name: MMLU Virology (Sepedi)
937
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sepedi by human translators.
938
+ metric_groups:
939
+ - accuracy
940
+ environment:
941
+ main_name: exact_match
942
+ main_split: test
943
+ taxonomy:
944
+ task: multiple-choice question answering
945
+ what: virology
946
+ who: various online sources
947
+ when: before 2021
948
+ language: Sepedi
949
+
950
+ - name: mmlu_clinical_afr_virology_sn
951
+ display_name: MMLU Virology (Shona)
952
+ short_display_name: MMLU Virology (Shona)
953
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Shona by human translators.
954
+ metric_groups:
955
+ - accuracy
956
+ environment:
957
+ main_name: exact_match
958
+ main_split: test
959
+ taxonomy:
960
+ task: multiple-choice question answering
961
+ what: virology
962
+ who: various online sources
963
+ when: before 2021
964
+ language: Shona
965
+
966
+ - name: mmlu_clinical_afr_virology_st
967
+ display_name: MMLU Virology (Sesotho)
968
+ short_display_name: MMLU Virology (Sesotho)
969
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Sesotho by human translators.
970
+ metric_groups:
971
+ - accuracy
972
+ environment:
973
+ main_name: exact_match
974
+ main_split: test
975
+ taxonomy:
976
+ task: multiple-choice question answering
977
+ what: virology
978
+ who: various online sources
979
+ when: before 2021
980
+ language: Sesotho
981
+
982
+ - name: mmlu_clinical_afr_virology_tn
983
+ display_name: MMLU Virology (Setswana)
984
+ short_display_name: MMLU Virology (Setswana)
985
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Setswana by human translators.
986
+ metric_groups:
987
+ - accuracy
988
+ environment:
989
+ main_name: exact_match
990
+ main_split: test
991
+ taxonomy:
992
+ task: multiple-choice question answering
993
+ what: virology
994
+ who: various online sources
995
+ when: before 2021
996
+ language: Setswana
997
+
998
+ - name: mmlu_clinical_afr_virology_ts
999
+ display_name: MMLU Virology (Tsonga)
1000
+ short_display_name: MMLU Virology (Tsonga)
1001
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Tsonga by human translators.
1002
+ metric_groups:
1003
+ - accuracy
1004
+ environment:
1005
+ main_name: exact_match
1006
+ main_split: test
1007
+ taxonomy:
1008
+ task: multiple-choice question answering
1009
+ what: virology
1010
+ who: various online sources
1011
+ when: before 2021
1012
+ language: Tsonga
1013
+
1014
+ - name: mmlu_clinical_afr_virology_xh
1015
+ display_name: MMLU Virology (Xhosa)
1016
+ short_display_name: MMLU Virology (Xhosa)
1017
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Xhosa by human translators.
1018
+ metric_groups:
1019
+ - accuracy
1020
+ environment:
1021
+ main_name: exact_match
1022
+ main_split: test
1023
+ taxonomy:
1024
+ task: multiple-choice question answering
1025
+ what: virology
1026
+ who: various online sources
1027
+ when: before 2021
1028
+ language: Xhosa
1029
+
1030
+ - name: mmlu_clinical_afr_virology_zu
1031
+ display_name: MMLU Virology (Zulu)
1032
+ short_display_name: MMLU Virology (Zulu)
1033
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) ([Hendrycks et al. 2021](https://arxiv.org/abs/2009.03300)) benchmark translated to Zulu by human translators.
1034
+ metric_groups:
1035
+ - accuracy
1036
+ environment:
1037
+ main_name: exact_match
1038
+ main_split: test
1039
+ taxonomy:
1040
+ task: multiple-choice question answering
1041
+ what: virology
1042
+ who: various online sources
1043
+ when: before 2021
1044
+ language: Zulu
1045
+