crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py

@@ -4,6 +4,7 @@ from typing import List
 
 from datasets import load_dataset, Features, Value, Sequence, Dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -88,3 +90,22 @@ class InfiniteBenchEnMCScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_mc",
+            display_name="∞Bench En.MC",
+            description="∞Bench En.MC is a multiple-choice question answering task that requires "
+            "locating and processing information within a novel, performing reasoning "
+            "through aggregation or filtering to derive answers. ([Zhang et al., "
+            "2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
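
The same pattern repeats across most of the scenario diffs below: each scenario gains a get_metadata() method returning a ScenarioMetadata record, usually built around a TaxonomyInfo. A minimal sketch of constructing and reading such a record, using only the keyword arguments visible in these hunks (the import paths are taken from the hunks; attribute-style access on the record is an assumption, and the field values are illustrative):

# Sketch only: mirrors the keyword arguments used by the get_metadata() methods in this diff.
# Assumes ScenarioMetadata and TaxonomyInfo expose their fields as attributes (e.g. dataclasses).
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import ScenarioMetadata

metadata = ScenarioMetadata(
    name="infinite_bench_en_mc",
    display_name="∞Bench En.MC",
    description="Multiple-choice question answering over full-length novels.",
    taxonomy=TaxonomyInfo(
        task="multiple-choice question answering",
        what="Novels",
        when="Before 2024",
        who="Novel authors",
        language="English",
    ),
    main_metric="exact_match",
    main_split="test",
)
print(metadata.name, metadata.main_metric)  # assumed attribute access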

helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py

@@ -2,6 +2,7 @@ import os
 import re
 from typing import List
 from datasets import load_dataset, Features, Value, Sequence, Dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -77,3 +79,20 @@ class InfiniteBenchEnSumScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_sum",
+            display_name="∞Bench En.Sum",
+            description="∞Bench En.Sum is a summarization task that requires generating a concise "
+            "summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multi-hop question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )

helm/benchmark/scenarios/koala_scenario.py

@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class KoalaScenario(Scenario):
@@ -39,3 +40,22 @@ class KoalaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="koala",
+            display_name="Koala test dataset",
+            short_display_name="Koala test dataset",
+            description="The test dataset from the [Koala "
+            "paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Web users",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/kpi_edgar_scenario.py

@@ -3,6 +3,7 @@ from typing import List, Dict
 import json
 import re
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -149,3 +151,22 @@ class KPIEDGARScenario(Scenario):
         with open(target_path, "r") as f:
             raw_dataset = json.load(f)
         return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="kpi_edgar",
+            display_name="KPI-EDGAR Financial Documents (Named Entity Recognition)",
+            short_display_name=None,
+            description="A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel "
+            "Dataset and Accompanying Metric for Relation Extraction from Financial "
+            "Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="named entity recognition",
+                what="financial reports",
+                when="before 2022",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="adjusted_macro_f1_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/legal_contract_summarization_scenario.py

@@ -4,6 +4,7 @@ import json
 import re
 
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -127,3 +129,21 @@ class LegalContractSummarizationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_contract_summarization",
+            display_name="Legal Contract Summarization",
+            short_display_name=None,
+            description="Plain English Summarization of Contracts [(Manor et al., "
+            "2019)](https://aclanthology.org/W19-2201.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="summarization",
+                what="legal contracts (e.g. terms of service, license agreements)",
+                when="before 2019",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )

helm/benchmark/scenarios/legal_summarization_scenario.py

@@ -5,6 +5,7 @@ from typing import List, Optional, Any
 
 import datasets
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 _ALL_LANGUAGES = {
@@ -205,3 +207,51 @@ class LegalSummarizationScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "BillSum":
+            return ScenarioMetadata(
+                name="billsum_legal_summarization",
+                display_name="BillSum",
+                description="The BillSum benchmark for legal text summarization ([Kornilova & Eidelmann, "
+                "2020](https://aclanthology.org/D19-5406/)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization", what="legal text from US bills", when=None, who="lawyers", language="English"
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "MultiLexSum":
+            return ScenarioMetadata(
+                name="multilexsum_legal_summarization",
+                display_name="MultiLexSum",
+                description="The MultiLexSum benchmark for legal text summarization ([Shen et al., "
+                "2022](https://arxiv.org/abs/2206.10883)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from US civil rights lawsuits",
+                    when=None,
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "EurLexSum":
+            return ScenarioMetadata(
+                name="eurlexsum_legal_summarization",
+                display_name="EurLexSum",
+                description="The EurLexSum benchmark for legal text summarization ([Aumiller et al., "
+                "2022](https://arxiv.org/abs/2210.13448)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from EU legislation",
+                    when="1960 - 2020",
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")

helm/benchmark/scenarios/legal_support_scenario.py

@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -102,3 +104,14 @@ class LegalSupportScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_support",
+            display_name="LegalSupport",
+            description="Scenario introduced in this work to measure fine-grained legal reasoning "
+            "through reverse entailment.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/legalbench_scenario.py

@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
@@ -144,3 +146,20 @@ class LegalBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="LegalBench",
+            description="LegalBench is a large collaboratively constructed benchmark of legal reasoning "
+            "tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="public legal and admininstrative documents, manually " "constructed questions",
+                when="before 2023",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/lex_glue_scenario.py

@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 ECTHR_A = "ecthr_a"
@@ -261,3 +262,13 @@ class LexGLUEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lex_glue",
+            display_name="LexGLUE",
+            description="A Benchmark Dataset for Legal Language Understanding in English",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/lextreme_scenario.py

@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -466,3 +467,13 @@ class LEXTREMEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lextreme",
+            display_name="LEXTREME",
+            description="A Multilingual Legal Benchmark for Natural Language Understanding",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/lsat_qa_scenario.py

@@ -2,6 +2,7 @@ import os
 import json
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -143,3 +145,15 @@ class LSATScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lsat_qa",
+            display_name="LSAT",
+            description="The LSAT benchmark for measuring analytical reasoning on the Law School "
+            "Admission Test (LSAT; [Zhong et al., "
+            "2021](https://arxiv.org/pdf/2104.06598.pdf)).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/madinah_qa_scenario.py

@@ -0,0 +1,73 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MadinahQAScenario(Scenario):
+    """MadinahQA Scenario"""
+
+    name = "madinah_qa"
+    description = "Arabic language competency benchmark"
+    tags = ["language", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+    HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+    SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        subset = subset.replace("_", " ")
+        if subset not in self.SUBSETS:
+            raise Exception(f"Unknown subset: {subset}")
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        instances: List[Instance] = []
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "MBZUAI/MadinahQA",
+            self.subset,
+            revision="62e7c86ac5c07245a5a952722691d77ddb41f695",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["Question"])
+                references: List[Reference] = []
+                correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                for option_index in range(1, 6):
+                    column_name = f"Option {option_index}"
+                    if not row[column_name]:
+                        continue
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
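
A detail worth noting in the new file above: the HuggingFace rows store the correct answer as a letter under "Answer Key", while the option columns are 1-based ("Option 1" through "Option 5"), so the letter is converted to a 1-based index before it is compared against each column. A standalone sketch of that mapping (the row dict below is made up for illustration, not real MadinahQA data):

# Illustrative row; the field names match those read in get_instances() above.
row = {
    "Question": "...",
    "Answer Key": "C",
    "Option 1": "first choice",
    "Option 2": "second choice",
    "Option 3": "third choice",
    "Option 4": "fourth choice",
    "Option 5": None,  # empty options are skipped by the `if not row[column_name]` check
}

# Same conversion as the scenario code: "A" -> 1, "B" -> 2, "C" -> 3, ...
correct_option_index = ord(row["Answer Key"]) - ord("A") + 1

kept_options = [i for i in range(1, 6) if row[f"Option {i}"]]
correct_options = [i for i in kept_options if i == correct_option_index]
assert correct_option_index == 3
assert correct_options == [3]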

helm/benchmark/scenarios/math_scenario.py

@@ -4,6 +4,7 @@ import typing
 from typing import Dict, List, Optional
 from datasets import load_dataset, DatasetDict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -450,3 +452,34 @@ class MATHScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        taxonomy = TaxonomyInfo(
+            task="numeric answer question answering",
+            what="math competitions (AMC, AIME, etc.)",
+            when="before 2021",
+            who="problem setters",
+            language="synthetic",
+        )
+        if self.use_chain_of_thought:
+            return ScenarioMetadata(
+                name="math_chain_of_thought",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
+                "2021)](https://arxiv.org/pdf/2103.03874.pdf).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv_chain_of_thought",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="math_regular",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems [(Hendrycks et al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv",
+                main_split="test",
+            )
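
This is the one hunk in this batch where the metadata depends on how the scenario is configured: the recorded scenario name and main metric differ between the chain-of-thought and regular MATH variants. A trivial sketch of that branch, with the string values copied from the two ScenarioMetadata calls above (the helper function itself is hypothetical, not part of HELM):

from typing import Tuple


# Hypothetical helper mirroring the two branches of MATHScenario.get_metadata() above.
def math_metadata_fields(use_chain_of_thought: bool) -> Tuple[str, str]:
    """Return the (name, main_metric) pair recorded for a MATH run."""
    if use_chain_of_thought:
        return ("math_chain_of_thought", "math_equiv_chain_of_thought")
    return ("math_regular", "math_equiv")


assert math_metadata_fields(True) == ("math_chain_of_thought", "math_equiv_chain_of_thought")
assert math_metadata_fields(False) == ("math_regular", "math_equiv")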

helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py

@@ -0,0 +1,68 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MBZUAIHumanTranslatedArabicMMLUScenario(Scenario):
+    """MBZUAI Human-Translated Arabic MMLU
+
+    A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+    """  # noqa: E501
+
+    name = "mbzuai_human_translated_arabic_mmlu"
+    description = (
+        "A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark"
+    )
+    tags = ["knowledge", "multiple_choice"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "MBZUAI/human_translated_arabic_mmlu",
+            self.subject,
+            revision="5ed7830fd678cfa6f2d7f0a1a13a4e1a1fa422ac",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            input = Input(text=row["question"])
+            references: List[Reference] = []
+            for choice_index, choice in enumerate(row["choices"]):
+                references.append(
+                    Reference(
+                        output=Output(text=choice),
+                        tags=[CORRECT_TAG] if choice_index == row["answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id-{self.subject}-{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/med_dialog_scenario.py

@@ -2,8 +2,18 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Input,
+    Output,
+    ScenarioMetadata,
+)
 
 
 class MedDialogScenario(Scenario):
@@ -133,3 +143,24 @@ class MedDialogScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_dialog",
+            display_name="MedDialog",
+            short_display_name="MedDialog",
+            description="MedDialog is a benchmark of real-world doctor-patient conversations focused on "
+            "health-related concerns and advice. Each dialogue is paired with a "
+            "one-sentence summary that reflects the core patient question or exchange. The "
+            "benchmark evaluates a model's ability to condense medical dialogue into "
+            "concise, informative summaries.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate summaries of doctor-patient conversations",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="med_dialog_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/med_mcqa_scenario.py

@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -109,3 +111,15 @@ class MedMCQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_mcqa",
+            display_name="MedMCQA",
+            description='MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to '
+            "address real-world medical entrance exam questions ([Flores et al. "
+            "2020](https://arxiv.org/abs/2203.14371)).",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )