crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic; see the registry advisory for details.
Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/thai_exam_scenario.py

@@ -2,6 +2,7 @@ import os
 from typing import Dict, List
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -142,3 +144,96 @@ class ThaiExamScenario(Scenario):
             instances.extend(self.process_jsonl(jsonl_path, splits[split]))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.exam == "onet":
+            return ScenarioMetadata(
+                name="thai_exam_onet",
+                display_name="ONET",
+                description="The Ordinary National Educational Test (ONET) is an examination for students "
+                "in Thailand. We select the grade-12 ONET exam, which comprises 5 subjects and "
+                "each question has 5 choices. These subjects are Thai, English, Mathematics, "
+                "Social Studies, and Science. Amounting to a total of 170 questions and "
+                "options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "ic":
+            return ScenarioMetadata(
+                name="thai_exam_ic",
+                display_name="IC",
+                description="The Investment Consultant (IC) examination, a licensing test for investment "
+                "professionals in Thailand. Developed by the Stock Exchange of Thailand (SET), "
+                "features 4 choices per question. We extracted questions for levels 1, 2, and 3 "
+                "resulting in a total of 95 questions and options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="licensing for investment professionals",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tgat":
+            return ScenarioMetadata(
+                name="thai_exam_tgat",
+                display_name="TGAT",
+                description="The Thai General Aptitude Test (TGAT), a national high school examination in "
+                "Thailand. Focuses on critical and logical thinking skills. We collected a "
+                "total of 90 questions and answers. The TGAT consists of four choices per "
+                "question.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school level questions on reasoning",
+                    when="?",
+                    who="n/a",
+                    language="English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tpat1":
+            return ScenarioMetadata(
+                name="thai_exam_tpat1",
+                display_name="TPAT-1",
+                description="TBD",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "a_level":
+            return ScenarioMetadata(
+                name="thai_exam_a_level",
+                display_name="A-Level",
+                description="An academic knowledge assessment examination (Applied Knowledge Level) that "
+                "covers general foundational subjects taught in schools. The content assessed "
+                "in this examination aligns with the curriculum guidelines and emphasizes the "
+                "practical application of knowledge in daily life. We collected a total of 175 "
+                "questions and answers.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        else:
+            raise ValueError(f"Unknown exam: {self.exam}")

helm/benchmark/scenarios/the_pile_scenario.py

@@ -5,9 +5,10 @@ import sys
 import requests
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 class ThePileScenario(Scenario):
@@ -146,3 +147,14 @@ class ThePileScenario(Scenario):
         instances = [instances[i] for i in indices]
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="the_pile",
+            display_name="The Pile",
+            description="The Pile corpus for measuring lanugage model performance across various "
+            "domains [(Gao et al., 2020)](https://arxiv.org/pdf/2101.00027.pdf).",
+            taxonomy=TaxonomyInfo(task="language modeling", what="?", when="?", who="?", language="English, code"),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/truthful_qa_scenario.py

@@ -2,6 +2,7 @@ import csv
 import os
 from typing import List, Dict, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -154,3 +156,15 @@ class TruthfulQAScenario(Scenario):
         valid_instances: List[Instance] = get_split_instances(VALID_SPLIT, data[split_k:])
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="truthful_qa",
+            display_name="TruthfulQA",
+            description="The TruthfulQA benchmarking for measuring model truthfulness and commonsense "
+            "knowledge in question answering [(Lin et al., "
+            "2022)](https://aclanthology.org/2022.acl-long.229/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )

helm/benchmark/scenarios/twitter_aae_scenario.py

@@ -2,9 +2,10 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 CODALAB_URI_TEMPLATE: str = (
     "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -56,3 +57,21 @@ class TwitterAAEScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="twitter_aae",
+            display_name="TwitterAAE",
+            description="The TwitterAAE corpus of [Blodgett et al. "
+            "(2016)](https://aclanthology.org/D16-1120/) for measuring language model "
+            "performance in tweets as a function of speaker dialect.",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English (AAE-aligned and White-aligned)",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/vicuna_scenario.py

@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class VicunaScenario(Scenario):
@@ -47,3 +48,22 @@ class VicunaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vicuna",
+            display_name="Vicuna",
+            short_display_name="Vicuna",
+            description="The set of prompts used by the "
+            "[Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Unknown",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/wikifact_scenario.py

@@ -2,6 +2,7 @@ import os
 from typing import List, Dict
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PID_TO_NAME = {
@@ -183,3 +185,21 @@ class WIKIFactScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wikifact",
+            display_name="WikiFact",
+            description="Scenario introduced in this work, inspired by [Petroni et al. "
+            "(2019)](https://aclanthology.org/D19-1250/), to more extensively test factual "
+            "knowledge.",
+            taxonomy=TaxonomyInfo(
+                task="knowledge base completion",
+                what="entity-relation-entity triples in natural language form",
+                when="?",
+                who="automatically generated from templates",
+                language="structured English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/wildbench_scenario.py

@@ -2,11 +2,13 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -81,3 +83,19 @@ class WildBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="WildBench",
+            description=self.description,
+            main_metric="wildbench_score_rescaled",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="GPT-judged instruction following with instructions collected from real-user conversations",
+                who="real-world users",
+                when="2024",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/wmt_14_scenario.py

@@ -1,5 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -106,3 +108,20 @@ class WMT14Scenario(Scenario):
                 )
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wmt_14",
+            display_name="WMT 2014",
+            description="WMT 2014 is a collection of machine translation datasets "
+            "[(website)](https://www.statmt.org/wmt14/index.html).",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="multilingual sentences",
+                when="before 2014",
+                who="Europarl, news, Common Crawl, etc.",
+                language="English, French, Czech, etc.",
+            ),
+            main_metric="bleu_4",
+            main_split="test",
+        )

helm/benchmark/static/schema_arabic.yaml

@@ -92,6 +92,12 @@ metrics:
     short_display_name: PEM
     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
     lower_is_better: false
+  - name: alrage_score
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: ALRAGE Score
+    short_display_name: Score
+    description: Score of the output judged by GPT-4o.
+    lower_is_better: false
 
 ############################################################
 perturbations: []
@@ -134,17 +140,20 @@ run_groups:
   - name: arabic_scenarios
     display_name: Arabic Scenarios
     description: Arabic Scenarios
-    category: All scenarios
+    category: Scenarios
     subgroups:
-      - mmmlu
-      - arabic_mmlu
       - alghafa
-      - exams_multilingual
+      - arabic_mmlu
+      - arabic_exams
+      - madinah_qa
       - aratrust
+      - alrage
+      - mbzuai_human_translated_arabic_mmlu
 
-  - name: mmmlu
-    display_name: Multilingual MMLU (Arabic)
-    description: Multilingual MMLU (Arabic)
+  - name: mbzuai_human_translated_arabic_mmlu
+    display_name: MBZUAI Human-Translated Arabic MMLU
+    short_display_name: Translated MMLU
+    description: A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark.
     metric_groups:
       - accuracy
       - efficiency
@@ -160,8 +169,8 @@ run_groups:
       language: Arabic
 
   - name: arabic_mmlu
-    display_name: Arabic MMLU
-    description: Arabic MMLU
+    display_name: ArabicMMLU
+    description: ArabicMMLU
     metric_groups:
       - accuracy
       - efficiency
@@ -193,9 +202,9 @@ run_groups:
      when: "before 2023"
      language: Arabic
 
-  - name: exams_multilingual
-    display_name: EXAMS (Arabic)
-    description: EXAMS (Arabic)
+  - name: arabic_exams
+    display_name: Arabic EXAMS
+    description: Arabic EXAMS
     metric_groups:
       - accuracy
       - efficiency
@@ -226,3 +235,37 @@ run_groups:
       who: "academic exams writers and takers"
      when: "before 2024"
      language: Arabic
+
+  - name: alrage
+    display_name: ALRAGE
+    description: ALRAGE
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: alrage_score
+      main_split: test
+    taxonomy:
+      task: "openbook (RAG) open-ended question answering"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Arabic
+
+  - name: madinah_qa
+    display_name: MadinahQA
+    description: Arabic language competency benchmark
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions about Arabic language"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic

helm/benchmark/static/schema_long_context.yaml

@@ -191,31 +191,12 @@ run_groups:
     description: Scenarios for evaluating long context capabilities
     category: All scenarios
     subgroups:
-      - ruler_hotpotqa
       - ruler_squad
-      - infinite_bench_en_sum
-      - infinite_bench_en_qa
+      - ruler_hotpotqa
       - infinite_bench_en_mc
+      - infinite_bench_en_sum
       - openai_mrcr
 
-  - name: ruler_hotpotqa
-    display_name: RULER HotPotQA
-    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
-    metric_groups:
-      - accuracy
-      - general_information
-      - annotation_metrics
-    environment:
-      main_name: ruler_string_match_part
-      main_split: valid
-    taxonomy:
-      task: question answering with retrieval-augmented generation
-      what: Wikipedia articles
-      who: Wikipedia authors
-      when: Before 2018
-      language: English
-
-
   - name: ruler_squad
     display_name: RULER SQuAD
     description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
@@ -233,21 +214,21 @@ run_groups:
       when: Before 2018
       language: English
 
-  - name: infinite_bench_en_qa
-    display_name: ∞Bench En.QA
-    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+  - name: ruler_hotpotqa
+    display_name: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: f1_score
-      main_split: test
+      main_name: ruler_string_match_part
+      main_split: valid
     taxonomy:
-      task: question answering
-      what: Novels
-      who: Novel authors
-      when: Before 2024
+      task: question answering with retrieval-augmented generation
+      what: Wikipedia articles
+      who: Wikipedia authors
+      when: Before 2018
       language: English
 
   - name: infinite_bench_en_mc

helm/benchmark/static/schema_medhelm.yaml

@@ -484,6 +484,8 @@ run_groups:
       - ehrshot
       - head_qa
       - medbullets
+      - med_qa
+      - med_mcqa
       - medalign
       - shc_ptbm_med
       - shc_sei_med
@@ -657,6 +659,40 @@ run_groups:
       when: Any
       language: English
 
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: med_mcqa
+    display_name: MedMCQA
+    description: MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to address real-world medical entrance exam questions ([Flores et al. 2020](https://arxiv.org/abs/2203.14371)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
   - name: medalign
     display_name: MedAlign
     short_display_name: MedAlign