crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/run_specs/medhelm_run_specs.py

@@ -3,7 +3,12 @@
 Website: https://crfm.stanford.edu/helm/medhelm/
 """
 
-from typing import Union
+import importlib.resources as pkg_resources
+
+import os
+from typing import Dict, Union, Optional
+
+import yaml
 
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -13,6 +18,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_multiple_choice_adapter_spec,
 )
 from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_exact_match_metric_specs,
@@ -22,10 +28,69 @@ from helm.benchmark.metrics.common_metric_specs import (
 )
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.run_specs.medhelm.benchmark_config import get_benchmark_config_from_path
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.common.gpu_utils import get_torch_device_name
 
 
+def get_judges_config(jury_config_path: Optional[str]) -> dict:
+    package = "helm.benchmark.scenarios.medhelm"
+    default_config_path = str(pkg_resources.files(package).joinpath("judges.yaml"))
+
+    if jury_config_path is None:
+        # Use the default config bundled with the package
+        jury_config_path = default_config_path
+
+    assert os.path.exists(jury_config_path), (
+        f"Judges config file not found: {jury_config_path}. "
+        f"If you are providing a custom config, make sure it follows the format specified in "
+        f"the default file: {default_config_path}"
+    )
+
+    with open(jury_config_path, "r") as f:
+        config = yaml.safe_load(f)
+
+    return config
+
+
+def get_annotator_models_from_config(jury_config_path: Optional[str]) -> Dict[str, AnnotatorModelInfo]:
+    config = get_judges_config(jury_config_path)
+    annotator_models = {
+        judge["name"]: AnnotatorModelInfo(
+            model_name=judge["model"],
+            model_deployment=judge["model_deployment"],
+        )
+        for judge in config["judges"]
+    }
+    return annotator_models
+
+
+@run_spec_function("medhelm_configurable_benchmark")
+def get_medhelm_configurable_benchmark_spec(config_path: str) -> RunSpec:
+    benchmark_config = get_benchmark_config_from_path(config_path)
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.medhelm_configurable_scenario.MedHELMConfigurableScenario",
+        args={"name": benchmark_config.name, "config_path": config_path},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        max_tokens=benchmark_config.max_tokens,
+        max_train_instances=0,
+        stop_sequences=[],
+    )
+    annotator_specs = benchmark_config.get_annotator_specs()
+    metric_specs = benchmark_config.get_metric_specs()
+
+    return RunSpec(
+        name=benchmark_config.name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=[benchmark_config.name],
+    )
+
+
 @run_spec_function("medcalc_bench")
 def get_medcalc_bench_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario")
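The get_annotator_models_from_config helper above only assumes a judges config with a top-level judges list whose entries carry name, model, and model_deployment keys. The bundled default helm/benchmark/scenarios/medhelm/judges.yaml added in this release is not shown in this diff, so the judge entries in the sketch below are placeholders; only the key names and the AnnotatorModelInfo mapping are taken from the code above.

import yaml

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo

# Placeholder judges config; the real defaults live in helm/benchmark/scenarios/medhelm/judges.yaml.
example_judges_yaml = """
judges:
  - name: gpt
    model: openai/gpt-4o-2024-05-13
    model_deployment: openai/gpt-4o-2024-05-13
  - name: claude
    model: anthropic/claude-3-5-sonnet-20241022
    model_deployment: anthropic/claude-3-5-sonnet-20241022
"""

config = yaml.safe_load(example_judges_yaml)
annotator_models = {
    judge["name"]: AnnotatorModelInfo(
        model_name=judge["model"],
        model_deployment=judge["model_deployment"],
    )
    for judge in config["judges"]
}
# annotator_models maps judge name -> AnnotatorModelInfo, the shape passed as the
# "annotator_models" argument to the annotators and to LLMJuryMetric throughout this file.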
@@ -91,7 +156,7 @@ def get_clear_spec(condition: str, data_path: str) -> RunSpec:
 
 
 @run_spec_function("mtsamples_replicate")
-def get_mtsamples_spec() -> RunSpec:
+def get_mtsamples_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mtsamples_replicate_scenario.MTSamplesReplicateScenario"
     )
@@ -106,8 +171,15 @@ def get_mtsamples_spec() -> RunSpec:
         stop_sequences=[],
     )
 
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(class_name="helm.benchmark.annotation.mtsamples_replicate_annotator.MTSamplesReplicateAnnotator")
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mtsamples_replicate_annotator.MTSamplesReplicateAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]
 
     metric_args = {
@@ -118,7 +190,15 @@ def get_mtsamples_spec() -> RunSpec:
     }
 
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.mtsamples_replicate_metrics.MTSamplesReplicateMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mtsamples_replicate_accuracy",
+                "scenario_name": "mtsamples_replicate",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
 
     return RunSpec(
@@ -293,6 +373,50 @@ def get_medbullets_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("medhelm_med_qa")
+def get_medhelm_med_qa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D. Do not include any explanation or additional text.",
+        input_noun="Question",
+        output_noun="Respond only with 'A', 'B', 'C' or 'D'. Do not add any other text, punctuation, or symbols.",
+        max_tokens=1,
+        max_train_instances=0,
+    )
+
+    return RunSpec(
+        name="med_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["med_qa"],
+    )
+
+
+@run_spec_function("medhelm_med_mcqa")
+def get_medhelm_med_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_mcqa_scenario.MedMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D. Do not include any explanation or additional text.",
+        input_noun="Question",
+        output_noun="Respond only with 'A', 'B', 'C' or 'D'. Do not add any other text, punctuation, or symbols.",
+        max_tokens=1,
+        max_train_instances=0,
+    )
+
+    return RunSpec(
+        name="med_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["med_mcqa"],
+    )
+
+
 @run_spec_function("medbullets_freetext")
 def get_medbullets_freetext_run_spec() -> RunSpec:
     """RunSpec for the MedBullets Free-text dataset."""
@@ -330,7 +454,7 @@ def get_medbullets_freetext_run_spec() -> RunSpec:
 
 
 @run_spec_function("medalign")
-def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
+def get_medalign_spec(data_path: str, jury_config_path: Optional[str] = None, max_length: int = 100000) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario",
         args={
@@ -349,7 +473,16 @@ def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
         max_train_instances=0,
     )
 
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.medalign_annotator.MedalignAnnotator")]
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.medalign_annotator.MedalignAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]
 
     metric_args = {
         "task": "medalign",
@@ -358,7 +491,15 @@ def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.medalign_metrics.MedalignMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "medalign_accuracy",
+                "scenario_name": "medalign",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
 
     return RunSpec(
@@ -418,7 +559,7 @@ def get_shc_sei_spec(data_path: str) -> RunSpec:
 
 
 @run_spec_function("dischargeme")
-def get_dischargeme_spec(data_path: str) -> RunSpec:
+def get_dischargeme_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.dischargeme_scenario.DischargeMeScenario",
         args={
@@ -440,7 +581,16 @@ def get_dischargeme_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
     )
 
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.dischargeme_annotator.DischargeMeAnnotator")]
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.dischargeme_annotator.DischargeMeAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]
 
     metric_args = {
         "task": "dischargeme",
@@ -449,7 +599,15 @@ def get_dischargeme_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.dischargeme_metrics.DischargeMeMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "dischargeme_accuracy",
+                "scenario_name": "dischargeme",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="dischargeme",
@@ -462,13 +620,12 @@ def get_dischargeme_spec(data_path: str) -> RunSpec:
 
 
 @run_spec_function("aci_bench")
-def get_aci_bench_run_spec() -> RunSpec:
+def get_aci_bench_run_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     """
     RunSpec for the ACI-Bench dataset.
     This configuration evaluates the model's ability to summarize
     doctor-patient dialogues into structured clinical notes.
     """
-    # Define the scenario
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.aci_bench_scenario.ACIBenchScenario",
         args={},
@@ -491,7 +648,16 @@ def get_aci_bench_run_spec() -> RunSpec:
         stop_sequences=[],
     )
 
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.aci_bench_annotator.ACIBenchAnnotator")]
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.aci_bench_annotator.ACIBenchAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]
 
     # Define the metrics
     metric_args = {
@@ -501,7 +667,15 @@ def get_aci_bench_run_spec() -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.aci_bench_metrics.ACIBenchMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "aci_bench_accuracy",
+                "scenario_name": "aci_bench",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
 
     # Return the RunSpec
@@ -516,7 +690,7 @@ def get_aci_bench_run_spec() -> RunSpec:
 
 
 @run_spec_function("mtsamples_procedures")
-def get_mtsamples_procedures_spec() -> RunSpec:
+def get_mtsamples_procedures_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mtsamples_procedures_scenario.MTSamplesProceduresScenario"
     )
@@ -530,10 +704,14 @@ def get_mtsamples_procedures_spec() -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)
 
     annotator_specs = [
         AnnotatorSpec(
-            class_name="helm.benchmark.annotation.mtsamples_procedures_annotator.MTSamplesProceduresAnnotator"
+            class_name="helm.benchmark.annotation.mtsamples_procedures_annotator.MTSamplesProceduresAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
         )
     ]
 
@@ -545,7 +723,15 @@ def get_mtsamples_procedures_spec() -> RunSpec:
     }
 
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.mtsamples_procedures_metrics.MTSamplesProceduresMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mtsamples_procedures_accuracy",
+                "scenario_name": "mtsamples_procedures",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
 
     return RunSpec(
@@ -559,7 +745,7 @@ def get_mtsamples_procedures_spec() -> RunSpec:
 
 
 @run_spec_function("mimic_rrs")
-def get_mimic_rrs_spec(data_path: str) -> RunSpec:
+def get_mimic_rrs_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mimic_rrs_scenario.MIMICRRSScenario",
         args={"data_path": data_path},
@@ -578,7 +764,17 @@ def get_mimic_rrs_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.mimic_rrs_annotator.MIMICRRSAnnotator")]
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mimic_rrs_annotator.MIMICRRSAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]
 
     metric_args = {
         "task": "mimic_rrs",
@@ -587,7 +783,15 @@ def get_mimic_rrs_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.mimic_rrs_metrics.MIMICRRSMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mimic_rrs_accuracy",
+                "scenario_name": "mimic_rrs",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="mimic_rrs",
@@ -600,7 +804,7 @@ def get_mimic_rrs_spec(data_path: str) -> RunSpec:
 
 
 @run_spec_function("mimic_bhc")
-def get_mimic_bhc_spec(data_path: str) -> RunSpec:
+def get_mimic_bhc_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario",
         args={"data_path": data_path},
@@ -616,7 +820,17 @@ def get_mimic_bhc_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.mimic_bhc_annotator.MIMICBHCAnnotator")]
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mimic_bhc_annotator.MIMICBHCAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]
 
     metric_args = {
         "task": "mimic_bhc",
@@ -625,7 +839,15 @@ def get_mimic_bhc_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.mimic_bhc_metrics.MIMICBHCMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mimic_bhc_accuracy",
+                "scenario_name": "mimic_bhc",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="mimic_bhc",
@@ -638,7 +860,7 @@ def get_mimic_bhc_spec(data_path: str) -> RunSpec:
 
 
 @run_spec_function("chw_care_plan")
-def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
+def get_chw_care_plan_run_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     """
     RunSpec for the chw_care_plan dataset.
     This configuration evaluates the model's ability to summarize
@@ -659,8 +881,16 @@ def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(class_name="helm.benchmark.annotation.chw_care_plan_annotator.CHWCarePlanAnnotator")
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.chw_care_plan_annotator.CHWCarePlanAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]
 
     metric_args = {
@@ -670,7 +900,15 @@ def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.chw_care_plan_metrics.CHWCarePlanMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "chw_care_plan_accuracy",
+                "scenario_name": "chw_care_plan",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     # Return the RunSpec
     return RunSpec(
@@ -684,7 +922,7 @@ def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
 
 
 @run_spec_function("medication_qa")
-def get_medication_qa_spec() -> RunSpec:
+def get_medication_qa_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
 
     adapter_spec = get_generation_adapter_spec(
@@ -695,8 +933,15 @@ def get_medication_qa_spec() -> RunSpec:
         max_tokens=512,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]
     metric_args = {
         "task": "medication_qa",
@@ -705,7 +950,15 @@ def get_medication_qa_spec() -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "medication_qa_accuracy",
+                "scenario_name": "medication_qa",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="medication_qa",
@@ -718,7 +971,7 @@ def get_medication_qa_spec() -> RunSpec:
 
 
 @run_spec_function("starr_patient_instructions")
-def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
+def get_starr_patient_instructions_run_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.starr_patient_instructions_scenario.StarrPatientInstructionsScenario",
         args={"data_path": data_path},
@@ -739,11 +992,16 @@ def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
         AnnotatorSpec(
             class_name=(
                 "helm.benchmark.annotation.starr_patient_instructions_annotator.StarrPatientInstructionsAnnotator"
-            )
+            ),
+            args={
+                "annotator_models": annotator_models,
+            },
         )
     ]
 
@@ -753,16 +1011,17 @@ def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
         "bertscore_model": "distilbert-base-uncased",
         "rescale_with_baseline": False,
     }
-    metric_specs = (
-        get_summarization_metric_specs(metric_args)
-        + [
-            MetricSpec(
-                class_name="helm.benchmark.metrics.starr_patient_instructions_metrics.StarrPatientInstructionsMetric",
-                args={},
-            )
-        ]
-        + get_basic_metric_specs([])
-    )
+    metric_specs = get_summarization_metric_specs(metric_args) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "starr_patient_instructions_accuracy",
+                "scenario_name": "starr_patient_instructions",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
+    ]
     return RunSpec(
         name="starr_patient_instructions",
         scenario_spec=scenario_spec,
@@ -774,7 +1033,7 @@ def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
 
 
 @run_spec_function("med_dialog")
-def get_med_dialog_spec(subset: str) -> RunSpec:
+def get_med_dialog_spec(subset: str, jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
     )
@@ -787,7 +1046,17 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.med_dialog_annotator.MedDialogAnnotator")]
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.med_dialog_annotator.MedDialogAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]
 
     metric_args = {
         "task": "med_dialog",
@@ -796,7 +1065,15 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.med_dialog_metrics.MedDialogMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "med_dialog_accuracy",
+                "scenario_name": "med_dialog",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name=f"med_dialog,subset={subset}",
@@ -832,7 +1109,7 @@ def get_shc_conf_spec(data_path: str) -> RunSpec:
 
 
 @run_spec_function("medi_qa")
-def get_medi_qa_spec() -> RunSpec:
+def get_medi_qa_spec(jury_config_path: Optional[str] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medi_qa_scenario.MediQAScenario", args={})
 
     adapter_spec = get_generation_adapter_spec(
@@ -843,7 +1120,17 @@ def get_medi_qa_spec() -> RunSpec:
         max_train_instances=0,
         stop_sequences=[],
     )
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.medi_qa_annotator.MediQAAnnotator")]
+
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.medi_qa_annotator.MediQAAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
+    ]
 
     metric_args = {
         "task": "medi_qa",
@@ -852,7 +1139,15 @@ def get_medi_qa_spec() -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.medi_qa_metrics.MediQAMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "medi_qa_accuracy",
+                "scenario_name": "medi_qa",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
     return RunSpec(
         name="medi_qa",
@@ -865,7 +1160,7 @@ def get_medi_qa_spec() -> RunSpec:
 
 
 @run_spec_function("mental_health")
-def get_mental_health_spec(data_path: str) -> RunSpec:
+def get_mental_health_spec(data_path: str, jury_config_path: Optional[str] = None) -> RunSpec:
     """
     Returns the run specification for the mental health counseling scenario.
     This scenario evaluates a model's ability to generate appropriate counseling responses
@@ -886,8 +1181,15 @@ def get_mental_health_spec(data_path: str) -> RunSpec:
         max_tokens=512,
         stop_sequences=[],
     )
+    annotator_models = get_annotator_models_from_config(jury_config_path)
+
     annotator_specs = [
-        AnnotatorSpec(class_name="helm.benchmark.annotation.mental_health_annotator.MentalHealthAnnotator")
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.mental_health_annotator.MentalHealthAnnotator",
+            args={
+                "annotator_models": annotator_models,
+            },
+        )
     ]
 
     metric_args = {
@@ -897,7 +1199,15 @@ def get_mental_health_spec(data_path: str) -> RunSpec:
         "rescale_with_baseline": False,
     }
     metric_specs = get_summarization_metric_specs(metric_args) + [
-        MetricSpec(class_name="helm.benchmark.metrics.mental_health_metrics.MentalHealthMetric", args={})
+        MetricSpec(
+            class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
+            args={
+                "metric_name": "mental_health_accuracy",
+                "scenario_name": "mental_health",
+                "annotator_models": annotator_models,
+                "default_score": 1.0,
+            },
+        )
     ]
 
     return RunSpec(
@@ -1217,7 +1527,7 @@ def get_shc_ent_spec(data_path: str) -> RunSpec:
 @run_spec_function("shc_privacy_med")
 def get_shc_privacy_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario",
+        class_name="helm.benchmark.scenarios.shc_privacy_scenario.SHCPRIVACYMedScenario",
         args={"data_path": data_path},
     )
 
@@ -1240,7 +1550,7 @@ def get_shc_privacy_spec(data_path: str) -> RunSpec:
 @run_spec_function("shc_proxy_med")
 def get_shc_proxy_spec(data_path: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPROXYMedScenario",
+        class_name="helm.benchmark.scenarios.shc_proxy_scenario.SHCPROXYMedScenario",
         args={"data_path": data_path},
     )
 
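The open-ended MedHELM run specs above now take an optional jury_config_path argument, so a custom judges file can be supplied per run entry rather than relying on the bundled default. A minimal sketch of an invocation (the helm-run flags, model name, and paths here are illustrative assumptions, not taken from this diff):

helm-run --run-entries "aci_bench:jury_config_path=/path/to/my_judges.yaml,model=openai/gpt-4o-2024-05-13" --suite my-medhelm-suite --max-eval-instances 10

When jury_config_path is omitted, get_judges_config falls back to the judges.yaml bundled under helm/benchmark/scenarios/medhelm/ and asserts that the resolved path exists.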