crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
--- a/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py
@@ -1,7 +1,7 @@
-from typing import List, Tuple
+from typing import List
 import os
-import json

+from datasets import load_dataset
 from tqdm import tqdm

 from helm.benchmark.scenarios.scenario import (
@@ -14,41 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from huggingface_hub import snapshot_download
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    if len(pairs) == 0:
-        raise ValueError(f"No pairs of MP3 and JSON files found in {directory}")
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array


 class UltraSuiteClassificationScenario(Scenario):
@@ -72,44 +38,39 @@ class UltraSuiteClassificationScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)

-        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
-        data_path = snapshot_download(
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

         instances: List[Instance] = []
         split: str = TEST_SPLIT

-        # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(data_path)
-        print(f"Num pairs: {len(pairs)}")
+        for idx, row in enumerate(tqdm(dataset["train"])):

-        for audio_path, json_path in tqdm(pairs):
             # Load the annotation
-            with open(json_path, "r") as f:
-                annotation = json.load(f)
+            label = row["disorder_class"]
+            transcription = row["transcription"]
+
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

-            # Get the correct answer and convert to label
-            answer = annotation["disorder_class"]
-            words = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
-            correct_label = 0
-            for option in ["typically_developing", "speech_disorder"]:
-                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
-                references.append(reference)
-                if option == answer:
-                    correct_label += 1
-            if correct_label == 0:
+            options = ["typically_developing", "speech_disorder"]
+            if label not in options:
                 continue
+            for option in options:
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                references.append(reference)

             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=audio_path),
-                MediaObject(content_type="text/plain", text=self.get_instruction(words)),
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]

             input = Input(multimedia_content=MultimediaObject(content))

--- a/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py
@@ -1,6 +1,7 @@
 from typing import List
-import json
+import os

+from datasets import load_dataset
 from tqdm import tqdm

 from helm.benchmark.scenarios.scenario import (
@@ -13,8 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from huggingface_hub import snapshot_download
-from .ultra_suite_classification_scenario import find_audio_json_pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array


 class UltraSuiteDisorderBreakdownScenario(Scenario):
@@ -38,46 +38,38 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'disorder_class' field
         """
-        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
-        data_path = snapshot_download(
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

         instances: List[Instance] = []
         split: str = TEST_SPLIT

-        # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(data_path)
-        print(f"Num pairs: {len(pairs)}")
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
             # Load the annotation
-            with open(json_path, "r") as f:
-                annotation = json.load(f)
+            label = row["disorder_type"]
+            transcription = row["transcription"]

-            # Get the correct answer and convert to label
-            if "disorder_type" not in annotation or "transcription" not in annotation:
-                continue
-            label = annotation["disorder_type"]
-            prompt = annotation["transcription"]
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

             # Create references for each option
             references: List[Reference] = []
-            correct_label = 0
-            for option in ["typically_developing", "articulation", "phonological"]:
+            options = ["typically_developing", "articulation", "phonological"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue

             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=audio_path),
-                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]

             input = Input(multimedia_content=MultimediaObject(content))

--- a/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py
+++ b/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py
@@ -1,7 +1,7 @@
-from typing import List, Tuple
+from typing import List
 import os
-import json

+from datasets import load_dataset
 from tqdm import tqdm

 from helm.benchmark.scenarios.scenario import (
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from huggingface_hub import snapshot_download
-
-
-def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
-    """
-    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
-    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
-
-    Args:
-        directory: Path to the directory containing the files
-
-    Returns:
-        List of tuples where each tuple contains (mp3_path, json_path)
-    """
-    pairs = []
-
-    # Walk through all directories and subdirectories
-    for root, _, files in os.walk(directory):
-        # Get all MP3 files in current directory
-        mp3_files = [f for f in files if f.endswith(".mp3")]
-
-        for mp3_file in mp3_files:
-            base_name = os.path.splitext(mp3_file)[0]
-            json_file = f"{base_name}.json"
-
-            # Check if corresponding JSON file exists in the same directory
-            if json_file in files:
-                mp3_path = os.path.join(root, mp3_file)
-                json_path = os.path.join(root, json_file)
-                pairs.append((mp3_path, json_path))
-
-    return pairs
+from helm.common.audio_utils import ensure_audio_file_exists_from_array


 class UltraSuiteDisorderSymptomsScenario(Scenario):
@@ -70,45 +39,37 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
-        data_path = snapshot_download(
-            repo_id="SAA-Lab/SLPHelmManualLabels",
-            repo_type="dataset",
-            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
-        )
+        audio_save_dir = os.path.join(output_path, "audio_files")
+        os.makedirs(audio_save_dir, exist_ok=True)
+
+        print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
+        dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")

         instances: List[Instance] = []
         split: str = TEST_SPLIT

-        # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(data_path)
-
-        for audio_path, json_path in tqdm(pairs):
+        for idx, row in enumerate(tqdm(dataset["train"])):
+            label = row["disorder_symptom"]
+            transcription = row["transcription"]

-            # Load the annotation
-            with open(json_path, "r") as f:
-                annotation = json.load(f)
+            unique_id = str(idx)
+            local_audio_name = f"{label}_{unique_id}.mp3"
+            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])

-            # Get the correct answer and convert to label
-            if "disorder_symptom" not in annotation or "transcription" not in annotation:
-                continue
-            label = annotation["disorder_symptom"]
-            prompt = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
-            correct_label = 0
-            for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
+            options = ["substitution", "omission", "addition", "typically_developing", "stuttering"]
+            if label not in options:
+                continue
+            for option in options:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
-                if option == label:
-                    correct_label += 1
-            if correct_label == 0:
-                continue

             # Create the input with audio and instruction
             content = [
-                MediaObject(content_type="audio/mpeg", location=audio_path),
-                MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+                MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
             ]

             input = Input(multimedia_content=MultimediaObject(content))

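Note: the three UltraSuite scenarios above now share the same loading pattern: read rows from the SAA-Lab/SLPHelmUltraSuitePlus dataset with datasets.load_dataset, write each in-memory audio array to a local .mp3 via helm.common.audio_utils.ensure_audio_file_exists_from_array, and point a MediaObject at that file. The minimal sketch below only reuses names that appear in the hunks above; the materialize_audio helper itself is illustrative and not part of the package.

# Sketch only: mirrors the shared loading pattern in the UltraSuite hunks above.
import os

from datasets import load_dataset

from helm.common.audio_utils import ensure_audio_file_exists_from_array


def materialize_audio(output_path: str, label_field: str = "disorder_class"):
    """Yield (label, transcription, local_audio_path) per row. Illustrative helper, not in the package."""
    audio_save_dir = os.path.join(output_path, "audio_files")
    os.makedirs(audio_save_dir, exist_ok=True)

    dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
    for idx, row in enumerate(dataset["train"]):
        label = row[label_field]
        # Write the in-memory audio array to disk so a MediaObject can reference a file path.
        local_audio_path = os.path.join(audio_save_dir, f"{label}_{idx}.mp3")
        ensure_audio_file_exists_from_array(
            local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"]
        )
        yield label, row["transcription"], local_audio_path
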
--- a/helm/benchmark/scenarios/babi_qa_scenario.py
+++ b/helm/benchmark/scenarios/babi_qa_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )


@@ -139,3 +141,16 @@
                 story.append(fact)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="babi_qa",
+            display_name="bAbI",
+            description="The bAbI benchmark for measuring understanding and reasoning [(Weston et al., "
+            "2015)](https://arxiv.org/pdf/1502.05698.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="question answering", what="reasoning", when="2015", who="synthetic", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

--- a/helm/benchmark/scenarios/banking77_scenario.py
+++ b/helm/benchmark/scenarios/banking77_scenario.py
@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -54,3 +56,22 @@
             instance = Instance(input=input, references=references, split=split_name)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="banking77",
+            display_name="BANKING77",
+            short_display_name="BANKING77",
+            description="BANKING77 is a benchmark for intent classification of customer service queries "
+            "in the banking domain [(Casanueva et al., "
+            "2020)](https://aclanthology.org/2020.nlp4convai-1.5/).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="customer service queries in the banking domain",
+                when="During or before 2020",
+                who="banking customers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

--- a/helm/benchmark/scenarios/bbq_scenario.py
+++ b/helm/benchmark/scenarios/bbq_scenario.py
@@ -3,6 +3,7 @@ import os
 import random
 from typing import List, Dict, Tuple

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     DEFAULT_TEST_SIZE,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )

 AMBIGUOUS_TAG = "ambiguous"
@@ -237,3 +239,16 @@
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="bbq",
+            display_name="BBQ (Bias Benchmark for Question Answering)",
+            short_display_name="BBQ",
+            description="The Bias Benchmark for Question Answering (BBQ) for measuring social bias in "
+            "question answering in ambiguous and unambigous context [(Parrish et al., "
+            "2022)](https://aclanthology.org/2022.findings-acl.165/).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="bbq_accuracy",
+            main_split="test",
+        )
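
Note: the babi_qa, banking77, and bbq hunks above all add the same get_metadata hook, returning a ScenarioMetadata built from a TaxonomyInfo. The sketch below shows the shape of that hook on a hypothetical scenario; the class and every field value are illustrative, and only the imports and keyword names are taken from the hunks above.

# Illustrative only: the shape of the get_metadata() hook added across scenarios in this release.
from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class ExampleScenario(Scenario):  # hypothetical scenario, not part of the package
    name = "example_scenario"
    description = "Placeholder scenario used to illustrate the metadata hook."
    tags = ["question_answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        return []  # instance construction omitted for brevity

    def get_metadata(self) -> ScenarioMetadata:
        # Keyword arguments mirror those used in the get_metadata additions above.
        return ScenarioMetadata(
            name="example_scenario",
            display_name="Example Scenario",
            description="One-line description, typically with a citation link.",
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="illustrative subject matter",
                when="n/a",
                who="n/a",
                language="English",
            ),
            main_metric="quasi_exact_match",
            main_split="test",
        )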