crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/run_specs/speech_disorder_audio_run_specs.py

@@ -112,9 +112,13 @@ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""",  # noqa: E501
-        max_tokens=10,
+        max_tokens=50,
     )
-    metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+    metric_specs: List[MetricSpec] = [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.ultra_suite_asr_classification_metrics.UltraSuiteASRMetric", args={}
+        )
+    ]
     run_spec_name: str = "ultra_suite_asr_classification"
     return RunSpec(
         name=run_spec_name,
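
The replacement metric above refers to the new UltraSuiteASRMetric by fully qualified class name. As a rough, hypothetical illustration of the kind of Metric subclass such a MetricSpec can point at (the shipped implementation lives in helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py and may differ), a minimal exact-match variant could look like this:

# Hypothetical sketch only; not the shipped UltraSuiteASRMetric.
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class ToyASRClassificationMetric(Metric):
    """Scores 1.0 when the normalized model output equals the gold reference text."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        gold: str = request_state.instance.references[0].output.text.strip().lower()
        assert request_state.result is not None
        prediction: str = request_state.result.completions[0].text.strip().lower()
        return [Stat(MetricName("toy_asr_exact_match")).add(float(prediction == gold))]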

helm/benchmark/scenarios/aci_bench_scenario.py

@@ -1,6 +1,7 @@
 import json
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -124,3 +126,24 @@ class ACIBenchScenario(Scenario):
         instances.extend(self.process_json(test_json, TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="aci_bench",
+            display_name="ACI-Bench",
+            description="ACI-Bench is a benchmark of real-world patient-doctor conversations paired "
+            "with structured clinical notes. The benchmark evaluates a model's ability to "
+            "understand spoken medical dialogue and convert it into formal clinical "
+            "documentation, covering sections such as history of present illness, physical "
+            "exam findings, results, and assessment and plan [(Yim et al., "
+            "2024)](https://www.nature.com/articles/s41597-023-02487-3).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Extract and structure information from patient-doctor " "conversations",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="aci_bench_accuracy",
+            main_split="test",
+        )
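
Since get_metadata() is new in this release, a quick way to see what it returns is to call it directly. The snippet below is a hypothetical inspection aid, not part of the package; it assumes ACIBenchScenario can be constructed without arguments and that ScenarioMetadata and TaxonomyInfo expose their constructor fields as attributes.

# Hypothetical inspection of the new metadata hook (not part of the package).
from helm.benchmark.scenarios.aci_bench_scenario import ACIBenchScenario

metadata = ACIBenchScenario().get_metadata()  # assumes a no-argument constructor
print(metadata.display_name)   # "ACI-Bench"
print(metadata.main_metric)    # "aci_bench_accuracy"
print(metadata.taxonomy.task)  # "Text generation"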

helm/benchmark/scenarios/air_bench_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -53,3 +55,22 @@ class AIRBench2024Scenario(Scenario):
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="air_bench_2024",
+            display_name="AIRBench 2024",
+            description="AIRBench 2024 is a AI safety benchmark that aligns with emerging government "
+            "regulations and company policies. It consists of diverse, malicious prompts "
+            "spanning categories of the regulation-based safety categories in the AIR 2024 "
+            "safety taxonomy.\n",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction-following text generation",
+                what="malicious prompts",
+                when="2024",
+                who="dataset authors and language models",
+                language="English",
+            ),
+            main_metric="air_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/alrage_scenario.py

@@ -0,0 +1,54 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class ALRAGEScenario(Scenario):
+    """ALRAGE"""  # noqa: E501
+
+    name = "alrage"
+    description = "ALRAGE"
+    tags = ["open-book question answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset: datasets.Dataset = datasets.load_dataset(
+            "OALL/ALRAGE",
+            revision="4827b2ed2436aea578e84d9bd4150b66ab8bbe0e",
+            split="train",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=f"السؤال:\n{row['question']}\n\nالسياقات المقترحة:\n{row['candidates']}\n")
+            references: List[Reference] = []
+            references = [
+                Reference(
+                    output=Output(text=row["gold_answer"]),
+                    tags=[CORRECT_TAG],
+                )
+            ]
+            instance = Instance(
+                id=row["id"],
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
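
This release also adds helm/benchmark/annotation/alrage_annotator.py and helm/benchmark/metrics/alrage_metric.py (see the file list above). Below is a hedged sketch of how the three pieces could be wired into a run spec; the real registration presumably sits in the expanded arabic_run_specs.py, and the annotator/metric class names, adapter spec, and group name are assumptions, not the shipped code.

# Hypothetical wiring sketch; class names and group below are assumptions.
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.annotation.annotator import AnnotatorSpec
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.run_spec import RunSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec


def get_alrage_run_spec_sketch(adapter_spec: AdapterSpec) -> RunSpec:
    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.alrage_scenario.ALRAGEScenario", args={})
    return RunSpec(
        name="alrage",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,  # e.g. a short-answer generation adapter
        annotators=[AnnotatorSpec(class_name="helm.benchmark.annotation.alrage_annotator.ALRAGEAnnotator")],
        metric_specs=[MetricSpec(class_name="helm.benchmark.metrics.alrage_metric.ALRAGEMetric", args={})],
        groups=["alrage"],
    )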

helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py

@@ -2,7 +2,8 @@ import re
 from typing import List, Any, Dict
 from datasets import load_dataset
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata
 
 
 class AnthropicHHRLHFScenario(Scenario):
@@ -88,3 +89,24 @@ class AnthropicHHRLHFScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="anthropic_hh_rlhf",
+            display_name="Anthropic RLHF dataset",
+            short_display_name="Anthropic RLHF dataset",
+            description="The dialogue datasets released by Anthropic to facilitate research in model "
+            "helpfulness and harmlessness ([Bai et al., "
+            "2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., "
+            "2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance "
+            "of each dialogue.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Human-LM dialogues and preference labels",
+                when="2022",
+                who="Workers from MTurk and Upwork, language models from Anthropic",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/anthropic_red_team_scenario.py

@@ -2,7 +2,8 @@ import re
 from typing import List, Any, Dict
 from datasets import load_dataset
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata
 
 
 class AnthropicRedTeamScenario(Scenario):
@@ -69,3 +70,13 @@ class AnthropicRedTeamScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="anthropic_red_team",
+            display_name="Anthropic Red Team",
+            description="Anthropic Red Team",
+            taxonomy=TaxonomyInfo(task="instruction following sfaety", what="?", when="?", who="?", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/arabic_exams_scenario.py

@@ -0,0 +1,114 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class ArabicEXAMSScenario(Scenario):
+    """The Arabic subset of the EXAMS High School Examinations Dataset for Multilingual Question Answering
+
+    We use the Open Arabic LLM Leaderboard (OALL) version mirror of the Arabic subset of EXAMS, which is in-turn based
+    on the AceGPT version.
+
+    See: https://www.tii.ae/news/introducing-open-arabic-llm-leaderboard-empowering-arabic-language-modeling-community
+
+    References:
+
+    ```
+    @misc{huang2024acegptlocalizinglargelanguage,
+        title={AceGPT, Localizing Large Language Models in Arabic},
+        author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Juncai He and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
+        year={2024},
+        eprint={2309.12053},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL},
+        url={https://arxiv.org/abs/2309.12053},
+    }```
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+          Mihaylov, Todor and
+          Zlatkova, Dimitrina and
+          Dinkov, Yoan and
+          Koychev, Ivan and
+          Nakov, Preslav",
+        editor = "Webber, Bonnie and
+          Cohn, Trevor and
+          He, Yulan and
+          Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """  # noqa: E501
+
+    name = "arabic_exams"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D"]
+    # Remap validation split to train split
+    HF_SPLIT_TO_HELM_SPLIT = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject.replace("_", " ")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits = datasets.load_dataset(
+            "OALL/Arabic_EXAMS",
+            revision="bc7a29346dbcaa16a8cd883b1f3e681ab2b7ff2a",
+            cache_dir=cache_dir,
+        )
+
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            for row in dataset:
+                subject = row["id"].split("-")[0]
+                if self.subject != "all" and self.subject != subject:
+                    continue
+                input = Input(text=row["question"])
+                references: List[Reference] = []
+                if row["answer"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answer column in row: {row}")
+                    continue
+                correct_choice = row["answer"]
+                for choice in self.CHOICES:
+                    references.append(
+                        Reference(
+                            output=Output(text=row[choice]),
+                            tags=[CORRECT_TAG] if choice == correct_choice else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
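
For a quick local sanity check, the new scenario can also be instantiated directly (in normal HELM runs it is built from a ScenarioSpec inside a run spec). This is an ad-hoc usage sketch; the output path is arbitrary, subject="all" keeps every subject, and underscores in a subject argument are mapped to spaces by the constructor.

# Ad-hoc usage sketch; output path and subject choice are arbitrary examples.
from helm.benchmark.scenarios.arabic_exams_scenario import ArabicEXAMSScenario

scenario = ArabicEXAMSScenario(subject="all")
instances = scenario.get_instances(output_path="./arabic_exams_data")
print(f"{len(instances)} instances; first split: {instances[0].split}")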

helm/benchmark/scenarios/arabic_mmlu_scenario.py

@@ -19,8 +19,6 @@ from helm.benchmark.scenarios.scenario import (
 class ArabicMMLUScenario(Scenario):
     """ArabicMMLU
 
-    EXPERIMENTAL: This scenario may have future reverse incompatible changes.
-
     ArabicMMLU is the first multi-task language understanding benchmark
     for Arabic language, sourced from school exams across diverse educational
     levels in different countries spanning North Africa, the Levant, and the
@@ -39,12 +37,16 @@ class ArabicMMLUScenario(Scenario):
     OPTIONS = ["A", "B", "C", "D"]
     HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
 
+    def __init__(self, subset: str):
+        super().__init__()
+        self.subset = subset.replace("_", " ")
+
     def get_instances(self, output_path: str) -> List[Instance]:
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
         dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
             "MBZUAI/ArabicMMLU",
-            "All",
+            self.subset,
             revision="7aa530e2893ac420352b3f5c1a1310c010e9758b",
             cache_dir=cache_dir,
         )
@@ -63,7 +65,9 @@ class ArabicMMLUScenario(Scenario):
                     continue
                 references.append(
                     Reference(
-                        output=Output(text=row[column_name]),
+                        # Need to convert column to string because the references are floats
+                        # for the subject "Math (Primary School)"
+                        output=Output(text=str(row[column_name])),
                         tags=[CORRECT_TAG] if option_index == correct_option_index else [],
                     )
                 )

helm/benchmark/scenarios/aratrust_scenario.py

@@ -47,8 +47,25 @@ class AraTrustScenario(Scenario):
     description = "aratrust"
     tags = ["trustworthiness"]
 
+    CATEGORIES = [
+        "Ethics",
+        "Illegal",
+        "Mental Health",
+        "Offensive",
+        "Physical Health",
+        "Privacy",
+        "Trustfulness",
+        "Unfairness",
+    ]
     OPTION_KEYS = ["A", "B", "C"]
 
+    def __init__(self, category: str):
+        super().__init__()
+        category = category.replace("_", " ")
+        if category not in self.CATEGORIES and category != "all":
+            raise Exception(f"Unknown category {category}")
+        self.category = category
+
     def get_instances(self, output_path: str) -> List[Instance]:
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
@@ -60,6 +77,8 @@ class AraTrustScenario(Scenario):
         )
         instances: List[Instance] = []
         for row_index, row in enumerate(dataset):
+            if self.category != "all" and self.category != row["Category"]:
+                continue
            question_text = row["Question"]
            option_texts = [row[option_key] for option_key in self.OPTION_KEYS if row[option_key]]
            joined_option_texts = "\n".join(option_texts)
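
The new constructor makes the category filter explicit: underscores in the argument become spaces, "all" keeps every category, and anything outside CATEGORIES raises. A short illustration (not part of the package):

# Illustration of the category normalization and validation added above.
from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario

AraTrustScenario(category="Mental_Health")  # stored as "Mental Health"
AraTrustScenario(category="all")            # no filtering; all categories kept
# AraTrustScenario(category="Bogus")        # would raise: Unknown category Bogus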

helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py

@@ -1,7 +1,7 @@
-from typing import List, Tuple
+from typing import List
 import os
-import json
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from huggingface_hub import snapshot_download
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteASRClassificationScenario(Scenario):
@@ -59,9 +28,6 @@ class UltraSuiteASRClassificationScenario(Scenario):
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder", "asr"]
 
-    # Classification options
-    options: List[str] = ["Healthy", "Unhealthy"]
-
     def get_instances(self, output_path: str) -> List[Instance]:
         """
         Create instances from the audio files and their corresponding JSON annotations.
@@ -69,36 +35,40 @@ class UltraSuiteASRClassificationScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
-        data_path = snapshot_download(
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
-        # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(data_path)
+        for idx, row in enumerate(tqdm(dataset["train"])):
 
-        for audio_path, json_path in tqdm(pairs):
+            label = row["disorder_class"]
+            transcription = row["transcription"]
 
-            # Load the annotation
-            with open(json_path, "r") as f:
-                annotation = json.load(f)
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
-            # Get the correct answer and convert to label
-            answer = annotation["disorder_class"]
             # Create references for each option
-            references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            references: List[Reference] = []
+            for option in ["typically_developing", "speech_disorder"]:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                references.append(reference)
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
            ]
 
            input = Input(multimedia_content=MultimediaObject(content))
-            instances.append(Instance(input=input, references=references, split=split))
+            instances.append(
+                Instance(input=input, references=references, split=split, extra_data={"transcription": transcription})
+            )
 
         return instances

helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py

@@ -1,7 +1,7 @@
-from typing import List, Tuple
+from typing import List
 import os
-import json
 
+from datasets import load_dataset
 from tqdm import tqdm
 
 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from huggingface_hub import snapshot_download
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
 
 
 class UltraSuiteASRTranscriptionScenario(Scenario):
@@ -66,31 +35,33 @@ class UltraSuiteASRTranscriptionScenario(Scenario):
         - Audio files (e.g., .mp3)
        - A JSON file with annotations containing 'answer' field
         """
-        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
-        data_path = snapshot_download(
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(data_path)
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
 
             # Load the annotation
-            with open(json_path, "r") as f:
-                annotation = json.load(f)
+            # Load the annotation
+            label = row["disorder_class"]
+
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
 
-            # Create references for the transcription
-            references: List[Reference] = [Reference(Output(text=annotation["transcription"]), tags=[CORRECT_TAG])]
+            # Create references for each option
+            references: List[Reference] = [Reference(Output(text=row["transcription"]), tags=[CORRECT_TAG])]
 
             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=audio_path),
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
             ]
 
             input = Input(multimedia_content=MultimediaObject(content))