crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of crfm-helm has been flagged as potentially problematic.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/scenario.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, replace
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Any
 import os
 from pathlib import PurePath
 import inspect
@@ -67,6 +67,11 @@ class Input:
     multimedia_content: Optional[MultimediaObject] = None
     """A single input can consists of multimodal content interleaved (e.g., text, image, text, ...)."""

+    messages: Optional[List[Dict[str, str]]] = None
+    """Used for chat models.
+    If messages is specified for a chat model, the prompt is ignored.
+    Otherwise, the client should convert the prompt into a message."""
+

 @dataclass(frozen=True)
 class PassageQuestionInput(Input):
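
Illustrative sketch (not part of the diff): how the new Input.messages field added above might be used, assuming the field definition shown in this hunk. Per the docstring, a populated messages list takes precedence over the plain text prompt, which clients otherwise wrap into a single message. The "role"/"content" keys below are an assumption; the field is only typed as List[Dict[str, str]].

from helm.benchmark.scenarios.scenario import Input

# Chat-style input: the client reads `messages` and ignores `text`.
# The "role"/"content" keys are assumed here, not taken from the source.
chat_input = Input(
    text="",  # ignored when messages is provided
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize the discharge note below."},
    ],
)

# Plain-text input: the client converts the prompt into a single message.
plain_input = Input(text="Summarize the discharge note below.")
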
@@ -153,6 +158,9 @@ class Instance:
     contrast_references: Optional[List[List[Reference]]] = None
     """References for the perturbed input above (if available)"""

+    extra_data: Optional[Dict[str, Any]] = None
+    """Extra data required by the scenario e.g. chain-of-thought annotations"""
+
     @property
     def first_correct_reference(self) -> Optional[Reference]:
         """Return the first correct reference."""

helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py}
@@ -19,7 +19,7 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog

-# BHASA Scenarios
+# SEA-HELM Scenarios
 # A. Natural Language Understanding
 # B. Natural Language Generation
 # C. Natural Language Reasoning
@@ -95,7 +95,12 @@ class TyDiQAScenario(Scenario):
         self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}

     def get_instances(self, output_path) -> List[Instance]:
-        dataset = datasets.load_dataset("khalidalt/tydiqa-goldp", "indonesian")
+        dataset = datasets.load_dataset(
+            "khalidalt/tydiqa-goldp",
+            "indonesian",
+            revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
+            trust_remote_code=True,
+        )

         outputs = []
         for split in self.splits.keys():

helm/benchmark/scenarios/self_instruct_scenario.py
@@ -3,7 +3,7 @@ import os
 from typing import List

 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output


 class SelfInstructScenario(Scenario):

helm/benchmark/scenarios/shc_bmt_scenario.py
@@ -0,0 +1,69 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCBMTMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a patient status gold-standard
+    for specific questions asked after a bone marrow transplant has taken place.
+    """
+
+    name = "shc_bmt_med"
+    description = (
+        "A dataset containing patient notes with associated "
+        "questions and answers related to bone marrow transplantation."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-BMT-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_cdi_scenario.py
@@ -0,0 +1,70 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCDIMedScenario(Scenario):
+    """
+    This benchmark dataset was built from Clinical Document Integrity (CDI)
+    notes were there are verifications of clinical activities. The idea behind
+    it was to assess an LLM capability to answer these questions from previous notes.
+    """
+
+    name = "shc_cdi_med"
+    description = (
+        "A dataset built from Clinical Document Integrity (CDI) notes, to assess "
+        "the ability to answer verification questions from previous notes."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CDI-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_conf_scenario.py
@@ -0,0 +1,70 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCONFMedScenario(Scenario):
+    """
+    Benchmark derived from extracting confidential information from clinical notes.
+    From Evaluation of a Large Language Model to Identify Confidential Content in
+    Adolescent Encounter Notes published at https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109
+    """
+
+    name = "shc_conf_med"
+    description = (
+        "A dataset of clinical notes from adolescent patients used to identify sensitive "
+        "protected health information that should be restricted from parental access."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CONF-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_ent_scenario.py
@@ -0,0 +1,72 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCENTMedScenario(Scenario):
+    """
+    This benchmark dataset was built to assess the capabilities "
+    "of an LLM for referral to the Ear, Nose and Throat department.
+    """
+
+    name = "shc_ent_med"
+    description = (
+        "A dataset designed to evaluate performance in "
+        "identifying appropriate patient referrals to Ear, Nose, and Throat specialists."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                if row["label"] != "":  # skip rows with character/encoding issues - 79
+                    question = row["prompt"]
+                    context = row["context"]
+                    answer = row["label"]
+                    prompt = (
+                        f"{counter} Provide an answer to the following {question} with the following context:"
+                        f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
+                        " Do not provide any additional details or response, just a simple A, B, or C response."
+                    )
+                    data[prompt] = answer
+                    counter = counter + 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-ENT-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_gip_scenario.py
@@ -0,0 +1,66 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCGIPMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a patient referral gold-standard set
+    to a specialty clinic to verify the ability of LLMs for patient hospice referral purposes.
+    """
+
+    name = "shc_gip_med"
+    description = "A dataset evaluating performance in identifying appropriate patient referrals to hospice care."
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-GIP-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_ptbm_scenario.py
@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPTBMMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters of
+    children ages 4-6 years old with ADHD seen at Stanford's community-based primary
+    care network, Packard Children's Health Alliance, between 2015-2019. In this classification
+    task, the LLM is tasked with classifying whether the note contains clinician recommendation
+    for parent training in behavior management, which is the first-line evidence-based treatment
+    for young children with ADHD. From publication: https://doi.org/10.1093/jamia/ocae001
+    """
+
+    name = "shc_ptbm_med"
+    description = (
+        "A dataset that classifies whether a clinical note contains a clinician "
+        "recommendation for parent training in behavior management, which is the first-line "
+        "evidence-based treatment for young children with ADHD."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children with "
+                    "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
+                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-PTBM-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_sei_scenario.py
@@ -0,0 +1,89 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSEIMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters
+    (in-person/telehealth and telephone) of children ages 6-11 years old with ADHD
+    seen at Stanford's community-based primary care network, Packard Children's Health Alliance,
+    between 2015-2022. All children in this dataset were prescribed at least once an ADHD
+    medication (stimulants or non-stimulants) by a primary care clinician. In this
+    classification task, the LLM is tasked with classifying whether the note contains
+    documentation of side effect monitoring (recording of absence or presence of
+    medication side effects), as recommended in clinical practice guidelines.
+    From publication: https://doi.org/10.1542/peds.2024-067223
+    """
+
+    name = "shc_sei_med"
+    description = (
+        "A dataset that classifies whether a clinical note contains documentation "
+        "of side effect monitoring (recording of absence or presence of medication "
+        "side effects), as recommended in clinical practice guidelines."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children "
+                    "with attention deficit hyperactivity disorder (ADHD). Given the following "
+                    "definitions: side Effects Inquiry (SEI): Explicit documentation by the clinician "
+                    "asking about current side effects related to ADHD medications that the child is "
+                    "taking or documentation of specific ADHD medication side effects experienced "
+                    "by the patient. SEI does *not* include future side effects monitoring, "
+                    "such as documentation of potential ADHD medication side effects, including "
+                    "planning to follow patients to monitor side effects, explaining about "
+                    "potential side effects of an ADHD medication. These documentations are not "
+                    "categorized as SEI because they consist of a plan or an explanation about "
+                    "side effects without actual side effect monitoring taking place, and "
+                    "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-SEI-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_sequoia_scenario.py
@@ -0,0 +1,69 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSequoiaMedScenario(Scenario):
+    """
+    Benchmark derived from manually curated answers to several questions for Sequoia clinic referrals
+    """
+
+    name = "shc_sequoia_med"
+    description = (
+        "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)  # , quoting=csv.QUOTE_MINIMAL
+            for row in reader:
+                question = row["question"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+                counter += 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-sequoia-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/simple_safety_tests_scenario.py
@@ -1,7 +1,7 @@
 from typing import List
 from datasets import load_dataset

-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output


 class SimpleSafetyTestsScenario(Scenario):