crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff compares the contents of two package versions as publicly released to their registry. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py
@@ -0,0 +1,68 @@
+ import os
+ from typing import List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ class MBZUAIHumanTranslatedArabicMMLUScenario(Scenario):
+     """MBZUAI Human-Translated Arabic MMLU
+
+     A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark from this paper:
+
+     - https://arxiv.org/pdf/2009.03300.pdf
+     """  # noqa: E501
+
+     name = "mbzuai_human_translated_arabic_mmlu"
+     description = (
+         "A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark"
+     )
+     tags = ["knowledge", "multiple_choice"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         self.subject: str = subject
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset = datasets.load_dataset(
+             "MBZUAI/human_translated_arabic_mmlu",
+             self.subject,
+             revision="5ed7830fd678cfa6f2d7f0a1a13a4e1a1fa422ac",
+             cache_dir=cache_dir,
+             split="test",
+         )
+         assert isinstance(dataset, datasets.Dataset)
+
+         # Read all instances
+         instances: List[Instance] = []
+         for row_index, row in enumerate(dataset):
+             input = Input(text=row["question"])
+             references: List[Reference] = []
+             for choice_index, choice in enumerate(row["choices"]):
+                 references.append(
+                     Reference(
+                         output=Output(text=choice),
+                         tags=[CORRECT_TAG] if choice_index == row["answer"] else [],
+                     )
+                 )
+             instance = Instance(
+                 id=f"id-{self.subject}-{row_index}",
+                 input=input,
+                 references=references,
+                 split=TEST_SPLIT,
+             )
+             instances.append(instance)
+
+         return instances
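For orientation, here is a minimal sketch of how the new scenario class might be exercised directly; the subject name and output directory are illustrative assumptions, and HELM normally drives scenarios through a run spec rather than calling get_instances() by hand.

# Sketch only: instantiate the new scenario and materialize its instances.
# The subject "anatomy" and the output path are assumed values for illustration.
from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.benchmark.scenarios.mbzuai_human_translated_arabic_mmlu import (
    MBZUAIHumanTranslatedArabicMMLUScenario,
)

scenario = MBZUAIHumanTranslatedArabicMMLUScenario(subject="anatomy")
instances = scenario.get_instances(output_path="./scenario_output")

for instance in instances[:3]:
    # Each reference tagged CORRECT_TAG is a gold answer choice.
    correct = [ref.output.text for ref in instance.references if CORRECT_TAG in ref.tags]
    print(instance.input.text[:80], "->", correct)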
helm/benchmark/scenarios/med_dialog_scenario.py
@@ -2,8 +2,18 @@ import json
  import os
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_directory_exists, ensure_file_downloaded
- from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, Input, Output
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     ALL_SPLITS,
+     CORRECT_TAG,
+     Input,
+     Output,
+     ScenarioMetadata,
+ )


  class MedDialogScenario(Scenario):
@@ -133,3 +143,24 @@ class MedDialogScenario(Scenario):
            )

        return instances
+
+     def get_metadata(self):
+         return ScenarioMetadata(
+             name="med_dialog",
+             display_name="MedDialog",
+             short_display_name="MedDialog",
+             description="MedDialog is a benchmark of real-world doctor-patient conversations focused on "
+             "health-related concerns and advice. Each dialogue is paired with a "
+             "one-sentence summary that reflects the core patient question or exchange. The "
+             "benchmark evaluates a model's ability to condense medical dialogue into "
+             "concise, informative summaries.",
+             taxonomy=TaxonomyInfo(
+                 task="Text generation",
+                 what="Generate summaries of doctor-patient conversations",
+                 when="Any",
+                 who="Clinician",
+                 language="English",
+             ),
+             main_metric="med_dialog_accuracy",
+             main_split="test",
+         )
helm/benchmark/scenarios/med_mcqa_scenario.py
@@ -2,6 +2,7 @@ import json
  import os
  from typing import Dict, List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
      Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
      VALID_SPLIT,
      Input,
      Output,
+     ScenarioMetadata,
  )


@@ -109,3 +111,15 @@ class MedMCQAScenario(Scenario):
            instances.append(instance)

        return instances
+
+     def get_metadata(self):
+         return ScenarioMetadata(
+             name="med_mcqa",
+             display_name="MedMCQA",
+             description='MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to '
+             "address real-world medical entrance exam questions ([Flores et al. "
+             "2020](https://arxiv.org/abs/2203.14371)).",
+             taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+             main_metric="exact_match",
+             main_split="valid",
+         )
helm/benchmark/scenarios/med_qa_scenario.py
@@ -2,6 +2,7 @@ import json
  import os
  from typing import Dict, List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
      Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
      VALID_SPLIT,
      Input,
      Output,
+     ScenarioMetadata,
  )


@@ -103,3 +105,21 @@ class MedQAScenario(Scenario):
            instances.append(instance)

        return instances
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="med_qa",
+             display_name="MedQA",
+             description="MedQA is an open domain question answering dataset composed of questions from "
+             "professional medical board exams ([Jin et al. "
+             "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
+             taxonomy=TaxonomyInfo(
+                 task="multiple-choice question answering",
+                 what="US medical licensing exams",
+                 when="before 2020",
+                 who="problem setters",
+                 language="English",
+             ),
+             main_metric="quasi_exact_match",
+             main_split="test",
+         )
helm/benchmark/scenarios/medalign_scenario.py
@@ -1,5 +1,6 @@
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      PassageQuestionInput,
      Output,
+     ScenarioMetadata,
  )
  from helm.benchmark.scenarios.medalign_scenario_helper import return_dataset_dataframe  # type: ignore

@@ -92,3 +94,24 @@ class MedalignScenario(Scenario):
      def get_instances(self, output_path: str) -> List[Instance]:
          dataset = return_dataset_dataframe(self.max_length, self.data_path)
          return self.process_tsv(dataset)
+
+     def get_metadata(self):
+         return ScenarioMetadata(
+             name="medalign",
+             display_name="MedAlign",
+             short_display_name="MedAlign",
+             description="MedAlign is a benchmark that evaluates a model's ability to interpret and "
+             "follow instructions grounded in longitudinal electronic health records (EHR). "
+             "Each instance includes an event-stream style patient record and a natural "
+             "language question or task, requiring clinically informed reading comprehension "
+             "and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).",
+             taxonomy=TaxonomyInfo(
+                 task="Text generation",
+                 what="Answer questions and follow instructions over longitudinal EHR",
+                 when="Any",
+                 who="Clinician, Researcher",
+                 language="English",
+             ),
+             main_metric="medalign_accuracy",
+             main_split="test",
+         )
helm/benchmark/scenarios/medalign_scenario_helper.py
@@ -2,22 +2,13 @@
  # type: ignore
  # fmt: off

- import ast
- import datetime
  import transformers
- import langchain
- import langchain.prompts
- import lxml.etree
  import os
  import pandas as pd
- import re
  import tiktoken

- from langchain_community.retrievers import BM25Retriever
  from tqdm import tqdm
- from typing import Any, Dict, Optional, Union, Callable
- from langchain.schema import Document
- import langchain_community
+ from typing import Any, Dict, Optional, Callable

  from helm.common.general import check_file_exists

@@ -167,102 +158,13 @@ def get_tokenizer(tokenizer_name: str) -> Callable:
      return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)


- def retrieve_most_relevant_visits(ehr_visit_strs, query, target_length, tokenizer):
-     """
-     Retrieve and filter relevant EHR visits based on a query and target length.
-
-     This function retrieves electronic health record (EHR) visit strings, sorts them
-     by relevance using the BM25Retriever, and constructs a list of final documents
-     that fit within a specified character length. The final list ensures that the
-     most important visit isn't cut off and is sorted chronologically.
-
-     Parameters:
-         ehr_visit_strs (list of str): List of EHR visit strings.
-         query (str): Query string to retrieve relevant visits.
-         target_length (int): Maximum total token count for the final list of documents.
-         tokenizer (Callable): Tokenizer that converts text to tokens (used for tracking context length)
-
-     Returns:
-         list[str]: List of EHR visit strings sorted chronologically and constrained by the target length.
-     """
-     ehr_visits=re.split(r'(?=</encounter>\n)',ehr_visit_strs)
-     langchain_docs = [
-         langchain.schema.Document(page_content=doc) for doc in ehr_visits  #broken since ehr_visit_strs is one string of all visits
-     ]
-     # `k` is the number of documents to retrieve
-     # We retrieve everything and just use the BM25Retriever to sort the documents
-     retriever = langchain_community.retrievers.BM25Retriever.from_documents(
-         langchain_docs, k=len(langchain_docs)
-     )
-
-     # Invoking the retriever means the most relevant documents are sorted first
-     sorted_docs = retriever.invoke(query)
-
-     # Define the regex pattern to find the start time
-     # pattern = r'start="([\d/]+ [\d:]+)"'
-     pattern = r'start="([\d/]+ [\d:]+ ?[APM]{0,2})"'
-
-     docs = []
-     dts = []
-
-     # Find the startime of the document
-     for doc in sorted_docs:
-         doc_content = doc.page_content
-         start_dt_match = re.search(pattern, doc_content)
-         if start_dt_match:
-             start_dt = start_dt_match.group(1)
-             parsed = False
-             # Try different date formats
-             for fmt in (
-                 "%m/%d/%y %I:%M %p",
-                 "%m/%d/%Y %I:%M %p",
-                 "%m/%d/%y %H:%M",
-                 "%m/%d/%Y %H:%M",
-             ):
-                 try:
-                     dts.append(datetime.datetime.strptime(start_dt, fmt))
-                     parsed = True
-                     break
-                 except ValueError:
-                     continue
-             if not parsed:
-                 print(f"Error parsing date: {start_dt}")
-                 continue
-         else:
-             print(f"Start time not found., {doc_content}")
-             dts.append(datetime.datetime.min)
-         docs.append(doc_content)
-
-     final_docs = []
-     current_length = 0
-
-     # Add documents until we exceed the allocated context length
-     for i in range(len(docs)):
-         doc_content = docs[i]
-         doc_length = len(tokenizer.encode(doc_content))
-         final_docs.append((dts[i], doc_content))
-         current_length += doc_length
-         if current_length > target_length:
-             break
-
-     # Sort final_docs chronologically
-     final_docs.sort(key=lambda x: x[0])
-
-     # Extract only the document content for the final output
-     final_docs_content = [doc_content for _, doc_content in final_docs]
-
-     return final_docs_content
-
-
-
  def pack_and_trim_prompts(
      instructions: Dict[int, Dict[str, str]],
      ehrs: Dict[int, str],
-     prompt_template: langchain.prompts.PromptTemplate,
+     prompt_string: str,
      context_length: int,
      generation_length: int,
      tokenizer: Any,
-     use_RAG: bool = True,
      verbose: bool = False,
      include_ehr: bool = True,
  ) -> Dict[int, str]:
@@ -276,26 +178,15 @@
          patient_id = int(instructions[instruction_id]["patient_id"])
          relevant_ehr = ehrs[patient_id]

-         # Calculate how many tokens of EHR we can include in the prompt
          num_tokens_instruction = len(tokenizer.encode(instruction))
-         num_tokens_prompt_template = len(tokenizer.encode(prompt_template.template))
+         num_tokens_prompt_template = len(tokenizer.encode(prompt_string))
          if include_ehr:
              target_ehr_length = context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
          else:
              target_ehr_length = 0
          if target_ehr_length <= 0:
-             prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr="")
+             prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr="")
          else:
-             if use_RAG:
-                 # Return a list of the most relevant visit strings
-                 most_relevant_visits = retrieve_most_relevant_visits(
-                     ehr_visit_strs=relevant_ehr,
-                     query=instruction,
-                     target_length=target_ehr_length,
-                     tokenizer=tokenizer,
-                 )
-                 relevant_ehr = "\n".join(most_relevant_visits)
-
              # Do a first pass with a fast tokenizer
              fast_tokenizer = tiktoken.get_encoding("cl100k_base")
              fast_encoded = fast_tokenizer.encode(relevant_ehr)
@@ -307,13 +198,17 @@
                  encoded_ehr = tokenizer.encode(fast_truncated_ehr)
                  truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
                  truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
-                 prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr=truncated_ehr)
+                 prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
+             else:
+                 # If the fast encoding is still too long, just use the full EHR up to allowed length
+                 truncated_ehr = fast_tokenizer.decode(fast_encoded[-target_ehr_length:])
+                 prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)

-             prompts_map[instruction_id] = prompt_with_truncated_ehr
+         prompts_map[instruction_id] = prompt_with_truncated_ehr

-             if verbose:
-                 print(prompt_with_truncated_ehr)
-                 print("~" * 20)
+         if verbose:
+             print(prompt_with_truncated_ehr)
+             print("~" * 20)
      return prompts_map


@@ -322,7 +217,6 @@ def preprocess_prompts(
      generation_length,
      path_to_instructions,
      path_to_ehrs,
-     use_RAG,
      include_ehr,
      tokenizer,
      codes_only=False,
@@ -347,16 +241,18 @@

      # CONSTRUCT & TRUNCATE PROMPTS #
      print("Constructing prompts using instructions and EHRs...")
-     prompt_string="Instruction: Answer the following question based on the EHR:\n\nEHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
-     prompt_template = langchain.prompts.PromptTemplate.from_template(prompt_string)
+     prompt_string = (
+         "Instruction: Answer the following question based on the EHR:\n\n"
+         "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
+     )
+
      filled_prompts = pack_and_trim_prompts(
          instructions=instructions,
          ehrs=ehrs,
-         prompt_template=prompt_template,
+         prompt_string=prompt_string,
          context_length=target_context_length,
          generation_length=generation_length,
          tokenizer=tokenizer,
-         use_RAG=use_RAG,
          verbose=False,
          include_ehr=include_ehr,
      )
@@ -415,7 +311,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
      path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
      path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
      check_file_exists(path_to_reference_responses, msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'")
-     use_RAG = False
      include_ehr = True
      tokenizer = "tiktoken"

@@ -424,7 +319,6 @@
          generation_length=generation_length,
          path_to_instructions=path_to_instructions,
          path_to_ehrs=path_to_ehrs,
-         use_RAG=use_RAG,
          include_ehr=include_ehr,
          tokenizer=tokenizer,
      )
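The net effect of this patch is that MedAlign prompt assembly no longer depends on langchain: the BM25 retrieval path is removed, and the EHR is trimmed to a token budget and spliced into a plain format string. A rough standalone sketch of that truncation idea, using only tiktoken and invented budget numbers and sample text, might look like this; it is an illustration, not the helper itself.

# Standalone illustration of the token-budget truncation used above (not HELM code).
# The context/generation budgets and the sample EHR text are made up for the example.
import tiktoken

prompt_string = (
    "Instruction: Answer the following question based on the EHR:\n\n"
    "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
)

def build_prompt(question: str, ehr: str, context_length: int, generation_length: int) -> str:
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # Budget for the EHR = total context minus generation room, template, and question.
    overhead = len(tokenizer.encode(prompt_string)) + len(tokenizer.encode(question))
    target_ehr_length = context_length - generation_length - overhead
    if target_ehr_length <= 0:
        return prompt_string.format(question=question, ehr="")
    # Keep the most recent tokens of the record, as the patched helper does.
    encoded = tokenizer.encode(ehr)
    truncated_ehr = tokenizer.decode(encoded[-target_ehr_length:])
    return prompt_string.format(question=question, ehr=truncated_ehr)

print(build_prompt("What medications were started?", "... long EHR text ...", 4096, 256))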
helm/benchmark/scenarios/medbullets_scenario.py
@@ -3,6 +3,7 @@ import csv
  import sys
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
      Output,
      Reference,
      Scenario,
+     ScenarioMetadata,
  )
  from helm.common.general import ensure_file_downloaded

@@ -143,3 +145,23 @@ class MedBulletsScenario(Scenario):
            csv_path = self.download_csv(output_path, split_suffix)
            instances.extend(self.process_csv(csv_path, split))
        return instances
+
+     def get_metadata(self):
+         return ScenarioMetadata(
+             name="medbullets",
+             display_name="Medbullets",
+             description="Medbullets is a benchmark of USMLE-style medical questions designed to assess "
+             "a model's ability to understand and apply clinical knowledge. Each question is "
+             "accompanied by a patient scenario and five multiple-choice options, similar to "
+             "those found on Step 2 and Step 3 board exams [(MedBullets, "
+             "2025)](https://step2.medbullets.com).",
+             taxonomy=TaxonomyInfo(
+                 task="Question answering",
+                 what="Medical knowledge testing",
+                 when="Any",
+                 who="Medical student, . Researcher",
+                 language="English",
+             ),
+             main_metric="exact_match",
+             main_split="test",
+         )
helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -1,6 +1,7 @@
  from typing import Dict, List
  from datasets import load_dataset

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.hierarchical_logger import hlog
  from helm.benchmark.scenarios.scenario import (
      Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      PassageQuestionInput,
      Output,
+     ScenarioMetadata,
  )


@@ -125,3 +127,23 @@ class MedCalcBenchScenario(Scenario):
            instances.extend(self.process_csv(data, split))

        return instances
+
+     def get_metadata(self):
+         return ScenarioMetadata(
+             name="medcalc_bench",
+             display_name="MedCalc-Bench",
+             description="MedCalc-Bench is a benchmark designed to evaluate models on their ability to "
+             "compute clinically relevant values from patient notes. Each instance consists "
+             "of a clinical note describing the patient's condition, a diagnostic question "
+             "targeting a specific medical value, and a ground truth response. [(Khandekar "
+             "et al., 2024)](https://arxiv.org/abs/2406.12036).",
+             taxonomy=TaxonomyInfo(
+                 task="Computational reasoning",
+                 what="Compute a specific medical value from a patient note",
+                 when="Any",
+                 who="Clinician, Researcher",
+                 language="English",
+             ),
+             main_metric="medcalc_bench_accuracy",
+             main_split="test",
+         )
helm/benchmark/scenarios/medec_scenario.py
@@ -1,6 +1,7 @@
  import csv
  import os
  from typing import List
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
      TEST_SPLIT,
      Input,
      Output,
+     ScenarioMetadata,
  )
  from helm.common.general import ensure_file_downloaded

@@ -123,3 +125,24 @@ class MedecScenario(Scenario):
        instances.extend(self.process_csv(test_csv, TEST_SPLIT))

        return instances
+
+     def get_metadata(self):
+         return ScenarioMetadata(
+             name="medec",
+             display_name="Medec",
+             description="Medec is a benchmark composed of clinical narratives that include either "
+             "correct documentation or medical errors. Each entry includes sentence-level "
+             "identifiers and an associated correction task. The model must review the "
+             "narrative and either identify the erroneous sentence and correct it, or "
+             "confirm that the text is entirely accurate [(Abacha et al., "
+             "2025)](https://arxiv.org/abs/2412.19260).",
+             taxonomy=TaxonomyInfo(
+                 task="Classification",
+                 what="Detect and correct errors in medical narratives",
+                 when="Any",
+                 who="Researcher, Clinician",
+                 language="English",
+             ),
+             main_metric="medec_error_flag_accuracy",
+             main_split="test",
+         )
helm/benchmark/scenarios/medhallu_scenario.py
@@ -1,6 +1,7 @@
  from typing import List
  from datasets import load_dataset

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
      Scenario,
      Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      Output,
      Input,
+     ScenarioMetadata,
  )


@@ -70,3 +72,24 @@ Answer: {answer}
            )
            instances.append(hallucinated_instance)
        return instances
+
+     def get_metadata(self):
+         return ScenarioMetadata(
+             name="medhallu",
+             display_name="MedHallu",
+             description="MedHallu is a benchmark focused on evaluating factual correctness in "
+             "biomedical question answering. Each instance contains a PubMed-derived "
+             "knowledge snippet, a biomedical question, and a model-generated answer. The "
+             "task is to classify whether the answer is factually correct or contains "
+             "hallucinated (non-grounded) information. This benchmark is designed to assess "
+             "the factual reliability of medical language models.",
+             taxonomy=TaxonomyInfo(
+                 task="Classification",
+                 what="Verify whether answers to questions from PubMed articles are " "factual or hallucinated",
+                 when="Any",
+                 who="Researcher",
+                 language="English",
+             ),
+             main_metric="exact_match",
+             main_split="test",
+         )
helm/benchmark/scenarios/medhelm/__init__.py
File without changes (empty file)
helm/benchmark/scenarios/medhelm/judges.yaml
@@ -0,0 +1,14 @@
+ # The judges to be used for evaluating the note summary scenario.
+ # name: The short name for the judge.
+ # model: The field value matching the 'model_name' field under model_deployments.yaml
+ # model_deployment: The field value matching the 'name' under model_deployments.yaml.
+ judges:
+   - name: "gpt"
+     model: "openai/gpt-4o-2024-05-13"
+     model_deployment: "stanfordhealthcare/gpt-4o-2024-05-13"
+   - name: "llama"
+     model: "meta/llama-3.3-70b-instruct"
+     model_deployment: "stanfordhealthcare/llama-3.3-70b-instruct"
+   - name: "claude"
+     model: "anthropic/claude-3-7-sonnet-20250219"
+     model_deployment: "stanfordhealthcare/claude-3-7-sonnet-20250219"
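The comments at the top of judges.yaml describe how each entry maps onto model_deployments.yaml. As a rough sketch, and assuming PyYAML, such a file could be consumed along these lines; this loader is illustrative, not the one shipped in HELM.

# Illustrative loader for a judges.yaml of the shape shown above; not part of HELM.
import yaml

with open("judges.yaml") as f:
    config = yaml.safe_load(f)

for judge in config["judges"]:
    # Per the file's comments, 'model' should match a model_name and
    # 'model_deployment' a deployment name in model_deployments.yaml.
    print(f"{judge['name']}: model={judge['model']} deployment={judge['model_deployment']}")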