crfm-helm: crfm_helm-0.5.4-py3-none-any.whl → crfm_helm-0.5.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,152 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from datasets import load_dataset, Dataset
4
+ from helm.common.hierarchical_logger import htrack_block
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TRAIN_SPLIT,
10
+ TEST_SPLIT,
11
+ VALID_SPLIT,
12
+ CORRECT_TAG,
13
+ Input,
14
+ Output,
15
+ )
16
+
17
+
18
class MELTTranslationScenario(Scenario):
    """Base scenario for MELT machine translation between Vietnamese and English.

    Loads a HuggingFace dataset and emits one instance per example: the value
    stored under the source-language code is the input text, and the value
    stored under the target-language code is the single correct reference.
    """

    name = "melt_translation"
    description = "Machine Translation scenario."
    tags = ["machine_translation"]

    def __init__(
        self,
        dataset_name: str,
        revision: str,
        source_language: str,
        target_language: str,
        subset: Optional[str] = None,
        splits: Optional[Dict[str, str]] = None,
    ):
        """Initializes the machine translation scenario.

        Args:
            dataset_name: The name of the dataset.
            revision: The revision of the dataset to use.
            source_language: The source language to use ("vi" or "en").
            target_language: The target language to use ("vi" or "en").
            subset: The subset of the dataset to use. Defaults to None.
            splits: Mapping from HELM split name (TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT)
                to the split name used by the dataset. Defaults to None, in which case
                the dataset splits "train", "validation" and "test" are used.

        Raises:
            ValueError: If a language is unsupported, if both languages are the
                same, or if neither language is English.
        """
        super().__init__()
        self.MAX_TRAIN_INSTANCES = 20_000
        valid_languages = {"vi", "en"}
        self.dataset_name = dataset_name
        self.subset = subset
        self.revision = revision
        self.splits = splits
        self.source_language = source_language
        self.target_language = target_language
        if self.source_language not in valid_languages or self.target_language not in valid_languages:
            raise ValueError("Supported languages: vi, en.")
        if self.source_language == self.target_language:
            raise ValueError("The source language and the target language should be different.")
        # With only "vi" and "en" allowed this check cannot currently fire; it is kept
        # as a guard in case more languages are added to `valid_languages`.
        if self.source_language != "en" and self.target_language != "en":
            raise ValueError("One of the languages should be English.")

    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
        """
        Helper for generating instances for the requested splits.

        Args:
            splits (dict): Mapping from dataset split name to HELM split name.

        Returns:
            List[Instance]: Instances for every requested split, in mapping order.
        """
        with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
            # NOTE(review): trust_remote_code=True lets the dataset repository run its own
            # loading script; `revision` is what pins which version of that code is fetched.
            hf_dataset: Any = load_dataset(
                self.dataset_name,
                self.subset,
                revision=self.revision,
                trust_remote_code=True,
            )

        instances: List[Instance] = []

        for dataset_split_name, helm_split_name in splits.items():
            split_data = hf_dataset[dataset_split_name]
            if helm_split_name == TRAIN_SPLIT:
                # Cap the training pool with a fixed shuffle seed so runs are reproducible.
                # The slice is passed back through Dataset.from_dict so the loop below
                # still iterates example by example.
                capped = split_data.shuffle(seed=42)[: self.MAX_TRAIN_INSTANCES]
                split_data = Dataset.from_dict(capped)

            for example in split_data:
                source_sentence = example[self.source_language]
                target_sentence = example[self.target_language]
                instances.append(
                    Instance(
                        input=Input(text=source_sentence),
                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
                        split=helm_split_name,
                    )
                )
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Builds the dataset-split -> HELM-split mapping and loads all instances.

        Args:
            output_path: Unused by this scenario.

        Returns:
            List[Instance]: Instances for every configured split.
        """
        if self.splits is None:
            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
        else:
            # `self.splits` is keyed by the HELM split constants, so the same constant is
            # used for both the membership test and the lookup. Testing string literals
            # while indexing with the constants drops the split (or raises KeyError)
            # whenever a constant differs from its literal.
            # NOTE(review): VALID_SPLIT is presumably not "validation" - confirm.
            splits = {
                self.splits[helm_split]: helm_split
                for helm_split in (TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT)
                if helm_split in self.splits
            }

        return self.get_instances_for_splits(splits=splits)
109
+
110
+
111
class MELTTranslationOPUS100Scenario(MELTTranslationScenario):
    """
    MELT translation scenario backed by the OPUS100 dataset.
    """

    name = "melt_translation_opus100"
    description = "OPUS100 dataset for machine translation."
    tags = ["machine_translation"]

    def __init__(self, **kwargs):
        # HELM split name -> split name used by the dataset.
        split_names = {
            TRAIN_SPLIT: "train",
            VALID_SPLIT: "validation",
            TEST_SPLIT: "test",
        }
        # Dataset and revision are fixed here; remaining arguments are forwarded as given.
        super().__init__(
            dataset_name="vietgpt/opus100_envi",
            revision="45df06fb0b31edc882d7c8d34389261f995e5208",
            splits=split_names,
            **kwargs,
        )
131
+
132
+
133
class MELTTranslationPhoMTScenario(MELTTranslationScenario):
    """
    MELT translation scenario backed by the PhoMT dataset.
    """

    name = "melt_translation_phomt"
    description = "PhoMT dataset for machine translation."
    tags = ["machine_translation"]

    def __init__(self, **kwargs):
        # HELM split name -> split name used by the dataset.
        split_names = {
            TRAIN_SPLIT: "train",
            VALID_SPLIT: "validation",
            TEST_SPLIT: "test",
        }
        # Dataset and revision are fixed here; remaining arguments are forwarded as given.
        super().__init__(
            dataset_name="ura-hcmut/PhoMT",
            revision="74386685db01dc038860ff0a90d9f5fbde284bf7",
            splits=split_names,
            **kwargs,
        )
@@ -0,0 +1,123 @@
1
+ import pandas as pd
2
+ from typing import List
3
+ from helm.benchmark.scenarios.scenario import (
4
+ Scenario,
5
+ Instance,
6
+ Reference,
7
+ TEST_SPLIT,
8
+ CORRECT_TAG,
9
+ PassageQuestionInput,
10
+ Output,
11
+ )
12
+ from helm.common.general import check_file_exists
13
+
14
+
15
class MentalHealthScenario(Scenario):
    """
    This scenario evaluates language models' ability to generate appropriate counseling responses
    in mental health conversations. The dataset contains counseling dialogues covering
    various topics including workplace issues, anxiety, suicidal thoughts, relationship
    problems, and more.

    Each dialogue consists of interactions between a counselor and a client, where the counselor
    demonstrates expert mental health counseling techniques. The dialogues were selected based on high
    quality scores from multiple evaluators.

    Example dialogue structure:
    ```
    counselor: Hi there, to start can you tell me your name and a little bit about what's been going on?
    client: I sleep too much... I'm 23, female and work as IT professional. I feel like I'm not fitting in...
    counselor: I can see you have been facing challenges with feeling like you don't fit in...
    ```

    The task is to generate the next counselor response given the conversation history. Models
    are evaluated on their ability to:
    1. Provide empathetic and supportive responses
    2. Follow proper mental health counseling protocols
    3. Generate contextually appropriate interventions

    The dataset includes:
    - 7 complete dialogues covering different mental health topics
    - Metadata about dialogue topic and type
    - Gold-standard counselor responses as references
    - Full conversation history for context

    Each instance includes:
    - input: Topic, type and previous conversation turns, followed by the generation instruction
    - reference: The actual counselor's response (gold standard)

    All instances are placed in the test split.
    """

    name = "mental_health"
    # Adjacent string literals are concatenated verbatim, so every fragment except the
    # last ends with a space; otherwise words fuse together ("inmental", "patientsand").
    description = (
        "MentalHealth is a benchmark focused on evaluating empathetic communication in "
        "mental health counseling. It includes simulated conversations between patients "
        "and counselors, where the task is to generate compassionate and appropriate counselor "
        "responses. The benchmark assesses a model's ability to support patients emotionally "
        "and meaningfully engage in therapeutic conversations."
    )
    tags = ["dialogue", "counseling", "mental_health", "empathy", "healthcare"]

    def __init__(self, data_path: str):
        """
        Args:
            data_path (str): Path to the CSV file holding the processed dialogues.
        """
        super().__init__()
        self.data_path = data_path

    def process_dialogue_data(self, data: pd.DataFrame) -> List[Instance]:
        """
        Process the dialogue data into evaluation instances.
        Each instance represents a point in the conversation where the model needs to generate
        a counselor response.

        Args:
            data (pd.DataFrame): DataFrame containing processed dialogues with columns:
                - context: Previous conversation history
                - gold_counselor_response: Reference counselor response
                - topic: Type of conversation (e.g., "Anxiety", "Workplace")
                - dialogue_type: Numerical type identifier

        Returns:
            List[Instance]: One test-split instance per row, in row order
        """
        instances: List[Instance] = []

        for _, row in data.iterrows():
            # Format input with clear section breaks and instructions
            input_text = (
                f"Topic: {row['topic']}\n"
                f"Type: {row['dialogue_type']}\n"
                f"\nPrevious conversation:\n{row['context']}\n"
                "\nGenerate an empathetic and appropriate counselor response:"
            )

            # Create input with empty passage since all context is in question
            prompt = PassageQuestionInput(passage="", question=input_text)

            # Create instance with gold standard response
            instances.append(
                Instance(
                    input=prompt,
                    references=[Reference(Output(text=row["gold_counselor_response"]), tags=[CORRECT_TAG])],
                    split=TEST_SPLIT,
                )
            )

        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Load and process the mental health dialogue dataset.

        Args:
            output_path (str): Unused by this scenario; data is read from `self.data_path`

        Returns:
            List[Instance]: List of processed instances for evaluation
        """
        check_file_exists(
            self.data_path, msg=f"[MentalHealthScenario] Required data file not found: '{self.data_path}'"
        )
        dialogue_data = pd.read_csv(self.data_path)

        # Process into instances
        return self.process_dialogue_data(dialogue_data)
@@ -0,0 +1,103 @@
1
+ import json
2
+ from typing import Dict, List
3
+
4
+ from helm.common.general import check_file_exists
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Input,
7
+ Scenario,
8
+ Instance,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Reference,
12
+ Output,
13
+ )
14
+
15
+
16
class MIMICBHCScenario(Scenario):
    # Raw docstring: the sample note below contains the sequence "\/", which is an
    # invalid escape (and triggers a warning) in a regular string literal.
    r"""
    MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
    course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).

    In total, the dataset contains 270,033 clinical notes.
    The splits are provided by the dataset itself.

    Sample Synthetic Prompt:
        Summarize the clinical note into a brief hospital course.

        Clinical Note:
        <SEX> M <SERVICE> SURGERY <ALLERGIES> No Known Allergies \/ Adverse Drug Reactions
        ...
        continue to follow-up with your health care providers as an outpatient.

        Brief Hospital Course:
        Mr. ___ was pre-admitted on ___ for liver transplantation
        ...
        discharged home to continue home medications and follow-up as an outpatient.

    @article{aali2024dataset,
        title={A dataset and benchmark for hospital course summarization with adapted large language models},
        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
        journal={Journal of the American Medical Informatics Association},
        volume={32},
        number={3},
        pages={470--479},
        year={2024},
        publisher={Oxford University Press}
    }

    @article{aali2024mimic,
        title={MIMIC-IV-Ext-BHC: Labeled Clinical Notes Dataset for Hospital Course Summarization},
        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
        journal={PhysioNet},
        year={2024}
    }
    """

    name = "mimic_bhc"
    # Adjacent string literals are concatenated verbatim, so every fragment except the
    # last ends with a space; otherwise words fuse together ("BriefHospital").
    description = (
        "MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief "
        "Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV, "
        "each paired with its corresponding BHC summary. The benchmark evaluates a model's "
        "ability to condense detailed clinical information into accurate, concise summaries that "
        "reflect the patient's hospital stay."
    )
    tags = ["summarization", "biomedical"]

    def __init__(self, data_path: str):
        """
        Args:
            data_path (str): Path to a JSON-lines file whose records have "input"
                (clinical note) and "target" (BHC summary) fields.
        """
        super().__init__()
        self.data_path = data_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Read the JSON-lines file and build one test-split instance per usable record.

        Records whose note or summary is empty are skipped.

        Args:
            output_path (str): Unused by this scenario; data is read from `self.data_path`.

        Returns:
            List[Instance]: Instances with the note as input and the summary as the correct reference.
        """
        check_file_exists(self.data_path, msg=f"[MIMICBHCScenario] Required data file not found: '{self.data_path}'")

        instances: List[Instance] = []
        # Limit to zero shot setting for now: only the test split is produced.
        # The whole file is assigned to every split listed here, so adding train or
        # validation entries also requires partitioning the records.
        splits: Dict[str, str] = {
            "test": TEST_SPLIT,
        }

        # Explicit encoding keeps parsing independent of the platform default;
        # blank lines (e.g. a trailing newline) are not valid JSON and are skipped.
        with open(self.data_path, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]

        for split in splits.values():
            for record in data:
                clinical_note: str = record["input"]
                bhc_summary: str = record["target"]
                if not clinical_note or not bhc_summary:
                    continue
                instances.append(
                    Instance(
                        input=Input(text=clinical_note),
                        references=[Reference(Output(text=bhc_summary), tags=[CORRECT_TAG])],
                        split=split,
                    )
                )

        return instances
@@ -0,0 +1,98 @@
1
+ import os
2
+ from typing import Dict, List
3
+
4
+ from helm.common.general import check_file_exists
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Input,
7
+ Scenario,
8
+ Instance,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Reference,
12
+ Output,
13
+ )
14
+
15
+
16
class MIMICRRSScenario(Scenario):
    """
    MIMIC-RRS is a biomedical radiology report summarization dataset collected from MIMIC-III and MIMIC-CXR
    radiology reports.
    In this scenario, we only consider the radiology reports from MIMIC-III.
    In total, the dataset contains 73,259 reports.
    The splits are provided by the dataset itself.

    Sample Synthetic Prompt:
        Generate the impressions of a radiology report based on its findings.

        Findings:
        The heart is normal in size. The lungs are clear.

        Impressions:

    @inproceedings{Chen_2023,
        title={Toward Expanding the Scope of Radiology Report Summarization to Multiple Anatomies and Modalities},
        url={http://dx.doi.org/10.18653/v1/2023.acl-short.41},
        DOI={10.18653/v1/2023.acl-short.41},
        booktitle={Proceedings of the 61st Annual Meeting of the Association
        for Computational Linguistics (Volume 2: Short Papers)},
        publisher={Association for Computational Linguistics},
        author={Chen, Zhihong and Varma, Maya and Wan, Xiang and Langlotz, Curtis and Delbrouck, Jean-Benoit},
        year={2023},
        pages={469–484}
    }
    """

    name = "mimic_rrs"
    # Adjacent string literals are concatenated verbatim, so every fragment except the
    # last ends with a space; otherwise words fuse together ("MIMIC-IIIdatabase").
    description = (
        "MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III "
        "database. It contains pairs of 'Findings' and 'Impression' sections, enabling evaluation "
        "of a model's ability to summarize diagnostic imaging observations into concise, clinically "
        "relevant conclusions."
    )
    tags = ["question_answering", "biomedical"]

    def __init__(self, data_path: str):
        """
        Args:
            data_path (str): Directory containing `<split>.findings.tok` and
                `<split>.impression.tok` files.
        """
        super().__init__()
        self.data_path = data_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Pair findings and impressions line by line and build one instance per pair.

        Pairs in which either side is empty after stripping are skipped.

        Args:
            output_path (str): Unused by this scenario; data is read from `self.data_path`.

        Returns:
            List[Instance]: Instances with the findings as input and the impression as the correct reference.
        """
        instances: List[Instance] = []
        # Limit to zero shot setting for now: only the test split is loaded.
        splits: Dict[str, str] = {
            "test": TEST_SPLIT,
        }

        for data_split, split in splits.items():
            findings_path: str = os.path.join(self.data_path, f"{data_split}.findings.tok")
            impressions_path: str = os.path.join(self.data_path, f"{data_split}.impression.tok")
            check_file_exists(
                findings_path, msg=f"[MIMICRRSScenario] Required findings file not found: '{findings_path}'"
            )
            check_file_exists(
                impressions_path, msg=f"[MIMICRRSScenario] Required impressions file not found: '{impressions_path}'"
            )
            findings: List[str] = self.read_file(findings_path)
            impressions: List[str] = self.read_file(impressions_path)
            # The two files are aligned by line number, so a length mismatch means the
            # pairs cannot be trusted.
            assert len(findings) == len(impressions), "Findings and impressions must have the same length"
            for finding, impression in zip(findings, impressions):
                if not finding or not impression:
                    continue
                instances.append(
                    Instance(
                        input=Input(text=finding),
                        references=[Reference(Output(text=impression), tags=[CORRECT_TAG])],
                        split=split,
                    )
                )

        return instances

    def read_file(self, file_path: str) -> List[str]:
        """
        Read a text file and return its lines with surrounding whitespace stripped.

        Blank lines are kept (as empty strings) so that line positions stay aligned
        between the findings and impressions files.

        Args:
            file_path (str): Path of the file to read.

        Returns:
            List[str]: One stripped string per line of the file.
        """
        # Explicit encoding keeps decoding independent of the platform default.
        with open(file_path, "r", encoding="utf-8") as file:
            return [line.strip() for line in file]
@@ -0,0 +1,77 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from typing import List
4
+
5
+ from helm.common.general import check_file_exists
6
+ from helm.benchmark.scenarios.scenario import (
7
+ Input,
8
+ Scenario,
9
+ Instance,
10
+ TEST_SPLIT,
11
+ CORRECT_TAG,
12
+ Reference,
13
+ Output,
14
+ )
15
+
16
+
17
class MIMICIVBillingCodeScenario(Scenario):
    """
    A scenario for MIMIC-IV discharge summaries where the task is to predict the ICD-10 code(s).

    - Input: The clinical note (column "text").
    - Output: The list of ICD-10 codes (column "target"), joined into one comma-separated string.

    All rows are placed in the test split.
    """

    name = "mimiciv_billing_code"
    # Adjacent string literals are concatenated verbatim, so every fragment except the
    # last ends with a space; otherwise words fuse together ("theMIMIC-IV").
    description = (
        "MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the "
        "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task "
        "requires models to extract structured billing codes based on free-text clinical notes, "
        "reflecting real-world hospital coding tasks for financial reimbursement."
    )
    tags = ["question_answering", "biomedical"]

    def __init__(self, data_path: str):
        """
        :param data_path: Path to the mimiciv_icd10.feather file.
        """
        super().__init__()
        self.data_path = data_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Read the feather file and build one test-split instance per row.

        Rows that fail to convert are reported on stdout and skipped rather than
        aborting the whole load.

        :param output_path: Unused by this scenario; data is read from `self.data_path`.
        :return: Instances with the note as input and the joined ICD-10 codes as the correct reference.
        """
        check_file_exists(
            self.data_path, msg=f"[MIMICIVBillingCodeScenario] Required data file not found: '{self.data_path}'"
        )

        # Read the preprocessed MIMIC-IV data (.feather format)
        df = pd.read_feather(self.data_path)  # columns: ["text", "target", ...]

        instances: List[Instance] = []

        # Use the entire dataset as one split (TEST_SPLIT)
        for idx, row in df.iterrows():
            try:
                note_text: str = row["text"]
                icd10_codes = row["target"]

                # Convert numpy array to list if necessary
                if isinstance(icd10_codes, np.ndarray):
                    icd10_codes = icd10_codes.tolist()
                elif not isinstance(icd10_codes, list):
                    icd10_codes = [str(icd10_codes)]  # Handle single values

                # Convert all codes to strings and join
                codes_str = ",".join(str(code) for code in icd10_codes)

                # Create one Instance per row
                instances.append(
                    Instance(
                        input=Input(text=note_text),
                        references=[Reference(Output(text=codes_str), tags=[CORRECT_TAG])],
                        split=TEST_SPLIT,
                    )
                )
            except Exception as e:
                # Deliberate best-effort: one malformed row must not discard the rest.
                print(f"Error processing row {idx}: {e}")
                continue

        return instances
@@ -0,0 +1,74 @@
1
+ import csv
2
+ import os
3
+ from typing import Dict, List
4
+
5
+ from helm.common.general import ensure_file_downloaded
6
+ from helm.common.hierarchical_logger import hlog
7
+ from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
8
+
9
+
10
class MMLU_Clinical_Afr_Scenario(Scenario):
    """
    Multiple-choice MMLU questions for one subject in one translated language.

    Data source:
    https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages

    Each CSV row is read as ``question, answer, ..., answer, correct_letter``, where
    the final column is a letter ("A"-"D") selecting the correct answer by position.
    """

    name = "mmlu_clinical_afr"
    description = "Massive Multitask Language Understanding (MMLU) translated into 11 African low-resource languages"
    tags = ["knowledge", "multiple_choice", "low_resource_languages"]

    def __init__(self, subject: str = "clinical_knowledge", lang: str = "af"):
        """
        :param subject: Subject name used as the prefix of the CSV file names.
        :param lang: Language code used as the suffix of the CSV file names.
        """
        super().__init__()
        self.subject: str = subject
        self.lang: str = lang

    def download_mmlu_clinical_afr(self, path: str):
        """
        Fetch the benchmark zip archive and unpack it at ``path``.

        :param path: Target path handed to ``ensure_file_downloaded``.
        """
        ensure_file_downloaded(
            source_url="https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip",  # noqa: E501
            target_path=path,
            unpack=True,
            unpack_type="unzip",
        )

    def process_csv(self, csv_path: str, split: str) -> List[Instance]:
        """
        Convert every non-empty row of a CSV file into an instance.

        :param csv_path: Path of the CSV file to read.
        :param split: Split assigned to every instance created from this file.
        :return: One instance per non-empty row, in file order.
        :raises KeyError: If a row's last column is not the letter of one of its answers.
        """
        instances: List[Instance] = []
        hlog(f"Reading {csv_path}")
        # The files hold translated, non-ASCII text; decode explicitly as UTF-8
        # instead of relying on the platform's default encoding.
        with open(csv_path, encoding="utf-8") as f:
            for row in csv.reader(f, delimiter=","):
                # csv.reader yields [] for blank lines; indexing those would fail.
                if not row:
                    continue

                question, answers, correct_choice = row[0], row[1:-1], row[-1]
                # Map the answer letter to the answer text it designates.
                correct_answer: str = dict(zip(["A", "B", "C", "D"], answers))[correct_choice]

                references = [
                    Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
                    for answer in answers
                ]
                instances.append(
                    Instance(
                        input=Input(text=question),
                        references=references,
                        split=split,
                    )
                )
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Download the data and load the instances of every split file that exists.

        :param output_path: Directory under which the raw data is stored.
        :return: Instances from the "dev", "val" and "test" files, in that order;
            a split whose CSV file is missing is logged and skipped.
        """
        # Download the raw data
        desired_dir = "mmlu_cm_ck_vir"
        data_path: str = os.path.join(output_path, desired_dir)
        self.download_mmlu_clinical_afr(data_path)

        # Read all the instances
        instances: List[Instance] = []
        splits: Dict[str, str] = {
            "dev": TRAIN_SPLIT,
            "val": VALID_SPLIT,
            "test": TEST_SPLIT,
        }
        for file_split, helm_split in splits.items():
            # NOTE(review): the CSVs are looked up in a nested directory of the same
            # name -- presumably the archive's own top-level folder; confirm.
            csv_path: str = os.path.join(data_path, desired_dir, f"{self.subject}_{file_split}_{self.lang}.csv")
            if not os.path.exists(csv_path):
                hlog(f"{csv_path} doesn't exist, skipping")
                continue
            instances.extend(self.process_csv(csv_path, helm_split))

        return instances