crfm-helm: crfm_helm-0.5.4-py3-none-any.whl → crfm_helm-0.5.6-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,432 @@
1
+ # flake8: noqa
2
+ # type: ignore
3
+ # fmt: off
4
+
5
+ import ast
6
+ import datetime
7
+ import transformers
8
+ import langchain
9
+ import langchain.prompts
10
+ import lxml.etree
11
+ import os
12
+ import pandas as pd
13
+ import re
14
+ import tiktoken
15
+
16
+ from langchain_community.retrievers import BM25Retriever
17
+ from tqdm import tqdm
18
+ from typing import Any, Dict, Optional, Union, Callable
19
+ from langchain.schema import Document
20
+ import langchain_community
21
+
22
+ from helm.common.general import check_file_exists
23
+
24
+
25
def get_instructions(path_to_instructions: str) -> Dict[int, Dict[str, Any]]:
    """
    Build a map from instruction ID to instruction details.

    The file is parsed as tab-separated values (``sep="\\t"``) and must
    contain at least the following columns:
        - instruction_id: The ID of the instruction.
        - question: The text of the instruction.
        - person_id: The ID of the associated patient.

    Every row of the file is used; no filtering on ``is_selected_ehr`` is
    applied. If an instruction_id appears on several rows, the last row wins.

    See https://stanfordmedicine.box.com/s/0om9qav2sklb9vaitn0ibye65vgbfx0e

    Parameters:
        path_to_instructions (str): Path to the TSV file containing instructions.

    Returns:
        Dict[int, Dict[str, Any]]: Maps each instruction ID to a dictionary
        with the keys "instruction" (question text) and "patient_id".

    Raises:
        FileNotFoundError: If the specified file does not exist.
        ValueError: If the file does not contain the required columns.
    """
    if not os.path.exists(path_to_instructions):
        raise FileNotFoundError(
            f"The specified file {path_to_instructions} does not exist."
        )

    instructions_df = pd.read_csv(path_to_instructions, sep="\t")
    required_columns = {"instruction_id", "question", "person_id"}
    missing_columns = required_columns.difference(instructions_df.columns)
    if missing_columns:
        # Name exactly what is absent so the caller does not have to diff the header by hand.
        raise ValueError(
            f"The instructions file {path_to_instructions} is missing required column(s): "
            f"{sorted(missing_columns)}"
        )

    return {
        row["instruction_id"]: {
            "instruction": row["question"],
            "patient_id": row["person_id"],
        }
        for _, row in instructions_df.iterrows()
    }
76
+
77
+
78
def extract_patient_id_from_fname(fname: str) -> Optional[int]:
    """
    Extract the patient ID from an EHR filename.

    The part of the filename before the first '.' is parsed as an integer.
    Both '<patient_id>.xml' and 'EHR_<patient_id>.xml' are accepted.

    Parameters:
        fname (str): The filename from which to extract the patient ID.

    Returns:
        Optional[int]: The extracted patient ID, or None if the filename
        does not match the expected format (e.g. '.DS_Store', 'README.md').
    """
    stem = fname.split(".")[0]
    prefix = "EHR_"
    if stem.startswith(prefix):
        stem = stem[len(prefix):]
    try:
        return int(stem)
    except ValueError:
        # Honour the Optional[int] contract instead of propagating the error;
        # callers check for None to skip unrelated files.
        return None
94
+
95
+
96
def get_ehrs(path_to_ehrs: str) -> Dict[int, str]:
    """
    Build a map from patient ID to EHR (Electronic Health Record) timeline.

    Every regular file in the directory whose name yields a patient ID via
    `extract_patient_id_from_fname` is read as UTF-8 text and stored under
    that ID. Sub-directories and files whose names do not yield an ID are
    skipped with a warning. Files are visited in sorted name order, so if two
    files map to the same ID the result is deterministic (the later name wins).

    See https://stanfordmedicine.box.com/s/r28wfwwude9rpjtu0szhzegmku8qv2pe

    Parameters:
        path_to_ehrs (str): The path to the directory containing the EHR files.

    Returns:
        Dict[int, str]: A dictionary mapping patient IDs to EHR timelines.

    Raises:
        FileNotFoundError: If the specified directory does not exist.
    """
    if not os.path.isdir(path_to_ehrs):
        raise FileNotFoundError(
            f"The specified directory {path_to_ehrs} does not exist."
        )

    ehr_map: Dict[int, str] = {}
    for fname in sorted(os.listdir(path_to_ehrs)):
        file_path = os.path.join(path_to_ehrs, fname)

        pt_id = None
        if os.path.isfile(file_path):
            try:
                pt_id = extract_patient_id_from_fname(fname)
            except ValueError:
                # A non-numeric name must not abort loading of the whole directory.
                pt_id = None

        if pt_id is None:
            print(
                f"Warning: File '{fname}' does not match the expected format "
                "and will be skipped."
            )
            continue

        with open(file_path, encoding="utf-8", mode="r") as f:
            ehr_map[pt_id] = f.read()
    return ehr_map
136
+
137
+
138
def get_tokenizer(tokenizer_name: str) -> Callable:
    """
    Look up a tokenizer by name.

    Names that refer to OpenAI models (compared case-insensitively) all map to
    tiktoken's "cl100k_base" encoding. Any other name is printed and passed to
    `transformers.AutoTokenizer.from_pretrained`.

    Parameters:
        tokenizer_name (str): One of "tiktoken", "chatgpt", "gpt-3.5-turbo",
            "gpt-4", "gpt-4-turbo", "gpt-4o", "cl100k_base", or any tokenizer
            name recognized by the transformers library.

    Returns:
        Callable: The tokenizer instance.
    """
    tiktoken_aliases = [
        "tiktoken",
        "chatgpt",
        "gpt-3.5-turbo",
        "gpt-4",
        "gpt-4-turbo",
        "gpt-4o",
        "cl100k_base",
    ]
    if tokenizer_name.lower() not in tiktoken_aliases:
        print(tokenizer_name)
        return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)
    return tiktoken.get_encoding("cl100k_base")
168
+
169
+
170
def retrieve_most_relevant_visits(ehr_visit_strs, query, target_length, tokenizer):
    """
    Select the EHR visits most relevant to a query, within a token budget.

    The EHR text is split into chunks at every position immediately before
    "</encounter>\\n". The chunks are ranked against the query with a
    BM25Retriever, then taken in rank order until the running token count
    exceeds `target_length`. The chunk that crosses the budget is still
    included, so the result may be longer than `target_length`. The selected
    chunks are returned sorted by start time.

    A chunk with no `start="..."` attribute is given `datetime.datetime.min`
    as its time. A chunk whose start attribute matches none of the known date
    formats is dropped, and a message is printed.

    Parameters:
        ehr_visit_strs (str): The full EHR text containing all visits.
        query (str): Query string used to rank the visits.
        target_length (int): Token budget for the selected visits.
        tokenizer: Object with an `encode(text)` method; the length of its
            result is used as the token count.

    Returns:
        list[str]: The selected chunks in chronological order.
    """
    # NOTE(review): because the split is a look-ahead, every chunk after the first
    # begins with the previous visit's "</encounter>\n" — confirm this is intended.
    visit_chunks = re.split(r'(?=</encounter>\n)', ehr_visit_strs)
    documents = [langchain.schema.Document(page_content=chunk) for chunk in visit_chunks]

    # k equals the corpus size: nothing is filtered out, the retriever is only used for ordering.
    ranker = langchain_community.retrievers.BM25Retriever.from_documents(
        documents, k=len(documents)
    )
    ranked_documents = ranker.invoke(query)

    start_pattern = r'start="([\d/]+ [\d:]+ ?[APM]{0,2})"'
    known_formats = (
        "%m/%d/%y %I:%M %p",
        "%m/%d/%Y %I:%M %p",
        "%m/%d/%y %H:%M",
        "%m/%d/%Y %H:%M",
    )

    # (start time, text) pairs, still in relevance order.
    dated_visits = []
    for ranked in ranked_documents:
        text = ranked.page_content
        found = re.search(start_pattern, text)
        if not found:
            print(f"Start time not found., {text}")
            dated_visits.append((datetime.datetime.min, text))
            continue

        raw_start = found.group(1)
        started_at = None
        for fmt in known_formats:
            try:
                started_at = datetime.datetime.strptime(raw_start, fmt)
            except ValueError:
                continue
            break
        if started_at is None:
            print(f"Error parsing date: {raw_start}")
            continue
        dated_visits.append((started_at, text))

    # Take visits by relevance until the budget is crossed.
    chosen = []
    tokens_used = 0
    for started_at, text in dated_visits:
        token_count = len(tokenizer.encode(text))
        chosen.append((started_at, text))
        tokens_used += token_count
        if tokens_used > target_length:
            break

    chosen.sort(key=lambda pair: pair[0])
    return [text for _, text in chosen]
255
+
256
+
257
+
258
def pack_and_trim_prompts(
    instructions: Dict[int, Dict[str, str]],
    ehrs: Dict[int, str],
    prompt_template: langchain.prompts.PromptTemplate,
    context_length: int,
    generation_length: int,
    tokenizer: Any,
    use_RAG: bool = True,
    verbose: bool = False,
    include_ehr: bool = True,
) -> Dict[int, str]:
    """
    Fill the prompt template for every instruction, fitting the EHR into the context.

    For each instruction the EHR token budget is
    `context_length - generation_length - len(template) - len(instruction)`,
    measured with `tokenizer`.

    - If `include_ehr` is False or the budget is not positive, the prompt is
      built with an empty EHR.
    - Otherwise the patient's EHR is used (optionally reduced to the most
      relevant visits when `use_RAG` is True). If the EHR has more tokens than
      the budget under tiktoken's "cl100k_base" encoding, the instruction is
      skipped and gets no prompt. If it fits, it is truncated from the left to
      the budget under `tokenizer` and inserted into the template.

    Parameters:
        instructions: Map from instruction ID to a dict with the keys
            "instruction" and "patient_id".
        ehrs: Map from patient ID to EHR text.
        prompt_template: Template with `{question}` and `{ehr}` placeholders.
        context_length: Total number of tokens available to the model.
        generation_length: Number of tokens reserved for the generated answer.
        tokenizer: Object with `encode` and `decode` methods.
        use_RAG: Whether to rank and select visits before checking the budget.
        verbose: Whether to print every constructed prompt.
        include_ehr: Whether to include the EHR in the prompt at all.

    Returns:
        A map from Instruction ID to prompt. Instructions whose EHR does not
        fit in the budget are absent from the map.

    Raises:
        KeyError: If an instruction refers to a patient ID missing from `ehrs`.
    """
    prompts_map: Dict[int, str] = {}

    # The template does not change between instructions; tokenize it once.
    num_tokens_prompt_template = len(tokenizer.encode(prompt_template.template))
    # Loaded on first use so that runs which never include an EHR do not need tiktoken data.
    fast_tokenizer = None

    for instruction_id in tqdm(instructions.keys()):
        instruction = instructions[instruction_id]["instruction"]
        patient_id = int(instructions[instruction_id]["patient_id"])
        relevant_ehr = ehrs[patient_id]

        # Calculate how many tokens of EHR we can include in the prompt
        if include_ehr:
            num_tokens_instruction = len(tokenizer.encode(instruction))
            target_ehr_length = (
                context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
            )
        else:
            target_ehr_length = 0

        if target_ehr_length <= 0:
            prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr="")
        else:
            if use_RAG:
                # Return a list of the most relevant visit strings
                most_relevant_visits = retrieve_most_relevant_visits(
                    ehr_visit_strs=relevant_ehr,
                    query=instruction,
                    target_length=target_ehr_length,
                    tokenizer=tokenizer,
                )
                relevant_ehr = "\n".join(most_relevant_visits)

            # Do a first pass with a fast tokenizer
            if fast_tokenizer is None:
                fast_tokenizer = tiktoken.get_encoding("cl100k_base")
            fast_encoded = fast_tokenizer.encode(relevant_ehr)
            if len(fast_encoded) > target_ehr_length:
                # The EHR does not fit. Skip explicitly so that no variable from a
                # previous iteration (i.e. another patient's record) can be reused.
                continue
            fast_encoded_truncated = fast_encoded[-(2 * target_ehr_length):]
            fast_truncated_ehr = fast_tokenizer.decode(fast_encoded_truncated)

            # Then do a second pass with the actual tokenizer
            encoded_ehr = tokenizer.encode(fast_truncated_ehr)
            truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
            truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
            prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr=truncated_ehr)

        prompts_map[instruction_id] = prompt_with_truncated_ehr

        if verbose:
            print(prompt_with_truncated_ehr)
            print("~" * 20)
    return prompts_map
318
+
319
+
320
def preprocess_prompts(
    target_context_length,
    generation_length,
    path_to_instructions,
    path_to_ehrs,
    use_RAG,
    include_ehr,
    tokenizer,
    codes_only=False,
    notes_only=False,
):
    """
    Load instructions and EHRs from disk and build one prompt per instruction.

    Parameters:
        target_context_length (int): Total number of tokens available to the model.
        generation_length (int): Number of tokens reserved for the generated answer.
        path_to_instructions (str): Path to the instructions file.
        path_to_ehrs (str): Path to the directory containing the EHR files.
        use_RAG (bool): Passed through to `pack_and_trim_prompts`.
        include_ehr (bool): Passed through to `pack_and_trim_prompts`.
        tokenizer (str): Tokenizer name, resolved with `get_tokenizer`.
        codes_only (bool): Currently unused; kept for interface compatibility.
        notes_only (bool): Currently unused; kept for interface compatibility.

    Returns:
        pd.DataFrame: One row per constructed prompt, with the columns
        "instruction_id" and "prompt".

    Raises:
        AssertionError: If no prompt could be constructed.
    """
    print(
        f"\n\twith target context length = {target_context_length} "
        f"\n\twith target generation length = {generation_length} "
    )

    # FETCH INSTRUCTIONS
    print("Fetching instructions...")
    instructions = get_instructions(path_to_instructions)

    # FETCH RELEVANT EHRs #
    print("Fetching patient EHR timelines...")
    ehrs = get_ehrs(path_to_ehrs)

    # LOAD TOKENIZER #
    print("Loading tokenizer...")
    tokenizer = get_tokenizer(tokenizer)

    # CONSTRUCT & TRUNCATE PROMPTS #
    print("Constructing prompts using instructions and EHRs...")
    prompt_string = (
        "Instruction: Answer the following question based on the EHR:"
        "\n\nEHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
    )
    prompt_template = langchain.prompts.PromptTemplate.from_template(prompt_string)
    filled_prompts = pack_and_trim_prompts(
        instructions=instructions,
        ehrs=ehrs,
        prompt_template=prompt_template,
        context_length=target_context_length,
        generation_length=generation_length,
        tokenizer=tokenizer,
        use_RAG=use_RAG,
        verbose=False,
        include_ehr=include_ehr,
    )
    # Raised explicitly (same exception type as before) so the check survives `python -O`.
    if not filled_prompts:
        raise AssertionError(
            f"No prompts were found for length: {target_context_length}. Try again with a larger length."
        )

    # Only the ID and the prompt are returned, so build that two-column frame directly.
    instructionid_to_prompt_df = pd.DataFrame(
        {
            "instruction_id": list(filled_prompts.keys()),
            "prompt": list(filled_prompts.values()),
        },
        columns=["instruction_id", "prompt"],
    )

    print("...Prompt construction complete")
    return instructionid_to_prompt_df
390
+
391
+
392
def add_reference_responses(prompts_df, path_to_reference_responses) -> pd.DataFrame:
    """
    Attach clinician reference responses to the constructed prompts.

    The reference file is read as tab-separated values. Only rows whose
    `annotator_num` equals 'Annotator_1' are kept, and they are inner-joined
    with `prompts_df` on "instruction_id".

    Parameters:
        prompts_df (pd.DataFrame): Prompts, with an "instruction_id" column.
        path_to_reference_responses (str): Path to the TSV file with the
            columns "annotator_num", "instruction_id" and "clinician_response".

    Returns:
        pd.DataFrame: "instruction_id" and "clinician_response", followed by
        the remaining columns of `prompts_df`, for instructions present in both.
    """
    all_responses = pd.read_csv(path_to_reference_responses, sep='\t')
    first_annotator = all_responses.query("annotator_num == 'Annotator_1'")
    references = first_annotator[["instruction_id", "clinician_response"]]
    return pd.merge(references, prompts_df, on="instruction_id", how="inner")
408
+
409
+
410
def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
    """
    Build the MedAlign dataset frame from the files under `data_path`.

    Expects "clinician-reviewed-model-responses.tsv" (used as the instructions
    file), "clinician-instruction-responses.tsv" (reference responses) and the
    directory "medalign_ehr_xml" inside `data_path`. Prompts are built with a
    generation length of 256 tokens, the "tiktoken" tokenizer, the EHR
    included and retrieval disabled.

    Parameters:
        max_length (int): Target context length in tokens.
        data_path (str): Directory containing the MedAlign files.

    Returns:
        pd.DataFrame: Prompts joined with the clinician reference responses.
    """
    def in_data_dir(name: str) -> str:
        return os.path.join(data_path, name)

    path_to_instructions = in_data_dir("clinician-reviewed-model-responses.tsv")
    check_file_exists(
        path_to_instructions,
        msg=f"[MedAlignScenario] Required instructions file not found: '{path_to_instructions}'",
    )

    path_to_reference_responses = in_data_dir("clinician-instruction-responses.tsv")
    check_file_exists(
        path_to_reference_responses,
        msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'",
    )

    prompts = preprocess_prompts(
        target_context_length=max_length,
        generation_length=256,
        path_to_instructions=path_to_instructions,
        path_to_ehrs=in_data_dir("medalign_ehr_xml"),
        use_RAG=False,
        include_ehr=True,
        tokenizer="tiktoken",
    )
    return add_reference_responses(prompts, path_to_reference_responses)
@@ -0,0 +1,145 @@
1
+ import os
2
+ import csv
3
+ import sys
4
+ from typing import List
5
+
6
+ from helm.benchmark.scenarios.scenario import (
7
+ CORRECT_TAG,
8
+ TEST_SPLIT,
9
+ Input,
10
+ Instance,
11
+ Output,
12
+ Reference,
13
+ Scenario,
14
+ )
15
+ from helm.common.general import ensure_file_downloaded
16
+
17
# Raise the csv field size limit as far as the platform allows.
# csv.field_size_limit stores the value in a C long, so sys.maxsize raises
# OverflowError where a C long is 32 bits; halve until the value is accepted.
_csv_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_size_limit)
        break
    except OverflowError:
        _csv_field_size_limit //= 2
18
+
19
+
20
class MedBulletsScenario(Scenario):
    """
    From "Benchmarking Large Language Models on Answering and Explaining Challenging Medical Questions"
    (Chen et al.), MedBullet is a dataset comprising USMLE Step 2&3 style clinical questions. The dataset
    is designed to evaluate the performance of LLMs in answering and explaining challenging medical questions,
    emphasizing the need for explainable AI in medical QA.

    Example from the dataset:

    Question:
    A 42-year-old woman is enrolled in a randomized controlled trial to study cardiac function in the setting of
    several different drugs. She is started on verapamil and instructed to exercise at 50% of her VO2 max while
    several cardiac parameters are being measured. During this experiment, which of the following represents
    the relative conduction speed through the heart from fastest to slowest?

    A) AV node > ventricles > atria > Purkinje fibers
    B) Purkinje fibers > ventricles > atria > AV node
    C) Purkinje fibers > atria > ventricles > AV node
    D) Purkinje fibers > AV node > ventricles > atria

    Answer:
    The answer is C. Explanation: The conduction velocity of the structures of the heart is in the following order:
    Purkinje fibers > atria > ventricles > AV node. A calcium channel blocker such as verapamil would only slow
    conduction in the AV node.

    @Article{MedBullet,
    author = {Hanjie Chen and Zhouxiang Fang and Yash Singla and Mark Dredze},
    title = {Benchmarking Large Language Models on Answering and Explaining Challenging Medical Questions},
    year = {2023},
    abstract = {LLMs have demonstrated impressive performance in answering medical questions, such as passing scores
    on medical licensing examinations. However, medical board exam questions or general clinical questions do not
    capture the complexity of realistic clinical cases. Moreover, the lack of reference explanations means we cannot
    easily evaluate the reasoning of model decisions, a crucial component of supporting doctors in making complex
    medical decisions. To address these challenges, we construct two new datasets: JAMA Clinical Challenge and
    Medbullets. JAMA Clinical Challenge consists of questions based on challenging clinical cases, while Medbullets
    comprises USMLE Step 2&3 style clinical questions. Both datasets are structured as multiple-choice
    question-answering tasks, where each question is accompanied by an expert-written explanation. We evaluate four
    LLMs on the two datasets using various prompts. Experiments demonstrate that our datasets are harder than
    previous benchmarks. The inconsistency between automatic and human evaluations of model-generated explanations
    highlights the need to develop new metrics to support future research on explainable medical QA.}}

    Task:
    Given a clinical question with multiple-choice options, models must identify the correct answer and generate a
    response that includes the reasoning, as described in the expert-written explanation.
    """

    DATASET_DOWNLOAD_BASE_URL = (
        "https://raw.githubusercontent.com/HanjieChen/ChallengeClinicalQA/refs/heads/main/medbullets/"
    )

    name = "medbullets"
    # Adjacent string literals are concatenated verbatim, so each fragment carries its own trailing space.
    description = (
        "Medbullets is a benchmark of USMLE-style medical questions designed to assess a "
        "model’s ability to understand and apply clinical knowledge. Each question is accompanied "
        "by a patient scenario and five multiple-choice options, similar to those found on "
        "Step 2 and Step 3 on the US medical licensing exam."
    )
    tags = ["reasoning", "biomedical"]

    # Define the possible answer choices
    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C", "D", "E"]

    def __init__(self):
        super().__init__()
        # self.splits = {"_op4": TRAIN_SPLIT, "_op5": TEST_SPLIT}
        # limit to zero shot setting for now
        self.splits = {"_op5": TEST_SPLIT}

    def download_csv(self, output_path: str, split: str):
        """
        Download the CSV file for the given split suffix and return its local path.

        Parameters:
            output_path (str): Directory in which the file is stored.
            split (str): File-name suffix of the split, e.g. "_op5".
        """
        file_name = f"medbullets{split}.csv"
        csv_path = os.path.join(output_path, file_name)
        # The base URL already ends with "/"; strip it so the final URL has no "//".
        base_url = self.DATASET_DOWNLOAD_BASE_URL.rstrip("/")
        ensure_file_downloaded(
            source_url=f"{base_url}/{file_name}",
            target_path=csv_path,
            unpack=False,
        )
        return csv_path

    def process_csv(self, csv_path: str, split: str) -> List[Instance]:
        """
        Read a CSV file and turn every valid row into an Instance.

        Rows without a "question", "answer_idx" or "opa" value are skipped.
        Each instance has one reference per entry of POSSIBLE_ANSWER_CHOICES,
        taken from the columns "opa" to "ope"; an option that is absent or
        empty becomes "Not applicable". The reference whose letter equals
        "answer_idx" is tagged as correct.
        """
        instances: List[Instance] = []
        with open(csv_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Validate required fields
                if not row.get("question") or not row.get("answer_idx") or not row.get("opa"):
                    print(f"Skipping invalid row: {row}")
                    continue

                # Letter of the correct answer; surrounding whitespace must not break the match.
                correct_option = row["answer_idx"].strip()

                # DictReader fills missing trailing fields with None, which a
                # `row.get(key, default)` would return as-is; `or` covers that case too.
                references = [
                    Reference(
                        Output(text=row.get(f"op{option.lower()}") or "Not applicable"),
                        tags=[CORRECT_TAG] if option == correct_option else [],
                    )
                    for option in self.POSSIBLE_ANSWER_CHOICES
                ]

                instances.append(
                    Instance(
                        input=Input(text=row["question"]),
                        references=references,
                        split=split,
                    )
                )
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download and process every configured split and return all instances."""
        instances: List[Instance] = []
        for split_suffix, split in self.splits.items():
            csv_path = self.download_csv(output_path, split_suffix)
            instances.extend(self.process_csv(csv_path, split))
        return instances