crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -0,0 +1,240 @@
1
+ import os
2
+ import json
3
+ from random import Random
4
+ from typing import Any, List, Dict
5
+
6
+ import pandas as pd
7
+ from pandas import DataFrame
8
+
9
+ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
10
+ from helm.benchmark.scenarios.scenario import (
11
+ Scenario,
12
+ Instance,
13
+ Reference,
14
+ TRAIN_SPLIT,
15
+ TEST_SPLIT,
16
+ CORRECT_TAG,
17
+ Input,
18
+ Output,
19
+ )
20
+
21
+
22
+ class CtiToMitreScenario(Scenario):
23
+ """
24
+ Original Task:
25
+ - The original task is to classify the description of the situation regarding the system
26
+ into the security threats in that situation.
27
+ - The classification categories are the approximately 200 categories of attack techniques
28
+ in the enterprise as defined by MITRE ATT&CK v10.1.
29
+
30
+ Implemented Task:
31
+ - Since classification into so many classes is difficult to handle in a generative language model
32
+ such as GPT itself, we implement this task as a multiple-choice task.
33
+ - Each choice is the name of the attack technique category into which the description is classified.
34
+ - The number of options is determined by the parameter (num_options).
35
+ - The minimum number of options is 2 and the maximum is 199, the number of all categories of
36
+ attack methods defined in MITRE ATT&CK v10.1.
37
+ - From the 199 choices, num_options choices, including the correct answer and a default case,
38
+ are randomly selected and used.
39
+ - If num_options is not specified, all 199 category names will be used as choices.
40
+
41
+ Data:
42
+ - dataset.csv
43
+ - Target dataset
44
+ - https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/a8cacf3185d098c686e0d88768a619a03a4d76d1/data/dataset.csv
45
+ - This data is of the form [sentence, label_tec, label_subtec, tec_name]
46
+ - sentence: the description
47
+ - label_tec: label for attack technique category
48
+ - label_subtec: label for attack technique subcategory
49
+ - tec_name : name(simple description) for attack technique subcategory
50
+ - Note: we need to extract name for attack technique category
51
+ from enterprise-attack.json
52
+
53
+ - enterprise-attack.json
54
+ - https://github.com/mitre/cti/archive/refs/tags/ATT&CK-v10.1.zip
55
+ - /mitre_v10/enterprise-attack/enterprise-attack.json
56
+ - This data contains relation from attack technique name to attack technique label
57
+ - we can extract attack technique category name for label_tec using this json data.
58
+
59
+
60
+ Prompt: (k is specified by num_options)
61
+ -----------------------
62
+ Answer the possible security attacks in each of the following situations from each of the options below.
63
+ [instruction]
64
+
65
+ Situation: <description> [in context examples]
66
+ A. <attack_category_name_1>
67
+ B. <attack_category_name_2>
68
+ ...
69
+ Y. <attack_category_name_(k-1)>
70
+ Z. Others
71
+ Answer: <correct_answer>
72
+
73
+ ... (Examples are output as long as the length allows) ...
74
+
75
+ Situation: <target_description> [target question]
76
+ A. <attack_category_name_t1>
77
+ B. <attack_category_name_t2>
78
+ ...
79
+ Y. <attack_category_name_t(k-1)>
80
+ Z. Others
81
+ Answer:
82
+ -----------------------
83
+
84
+ Example of prompt (num_options = 5)
85
+ -----------------------
86
+ Answer the possible security attacks in each of the following situations from each of the options below.
87
+
88
+ Situation: ZxShell can launch a reverse command shell.
89
+ A. Command and Scripting Interpreter
90
+ B. System Shutdown/Reboot
91
+ C. Exfiltration Over C2 Channel
92
+ D. Direct Volume Access
93
+ E. Others
94
+ Answer: A
95
+
96
+ ....(Omitted)...
97
+
98
+ Situation: APC injection is a method of executing arbitrary code in the address space.
99
+ A. Event Triggered Execution
100
+ B. Process Injection
101
+ C. Non-Application Layer Protocol
102
+ D. Escape to Host
103
+ E. Others
104
+ Answer: B
105
+
106
+ Situation: Timestomping may be used along with file name Masquerading to hide malware and tools.
107
+ A. Search Victim-Owned Websites
108
+ B. Internal Spearphishing
109
+ C. Application Layer Protocol
110
+ D. Indicator Removal on Host
111
+ E. Others
112
+ Answer:
113
+ -----------------------
114
+ """ # noqa: E501
115
+
116
+ # Names of the tasks we support
117
+ name = "cti_to_mitre"
118
+ description = "Classification of security attack opportunities on system"
119
+ tags = ["classification", "cyber_security"]
120
+
121
+ # Constant for splitting target data into train and test data.
122
+ train_ratio = 0.7
123
+
124
+ # Constant for default number of options. # of (MITRE ATT&CK attack categories) is 199 in ATT&CK-v10.1.zip
125
+ MAX_NUM_OPTIONS = 199
126
+
127
+ # Constant: the description for Others option
128
+ OTHERS_OPTION = "Others"
129
+
130
+ CTI_URL = "https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/a8cacf3185d098c686e0d88768a619a03a4d76d1/data/dataset.csv" # noqa: E501
131
+ MITRE_URL = "https://github.com/mitre/cti/raw/refs/tags/ATT&CK-v10.1/enterprise-attack/enterprise-attack.json"
132
+
133
+ def __init__(self, num_options: int = MAX_NUM_OPTIONS, seed: int = 42) -> None:
134
+ """
135
+ num_options: int, number of choices in multiple-choice task
136
+ seed: int, seed for random module. The seed is set to random if specified
137
+ """
138
+ super().__init__()
139
+ self.num_options = min(num_options, CtiToMitreScenario.MAX_NUM_OPTIONS)
140
+ self.random_seed = seed
141
+ self.random = Random(seed)
142
+
143
+ @staticmethod
144
+ def make_label_category_name_dict(jdata: Dict[str, Any]) -> Dict[str, str]:
145
+ """
146
+ This makes mapping from label_tec (attack technique category label) to tec_category_name
147
+ (attack technique category name)
148
+ - jdata is json object for enterprise_attack.json
149
+ """
150
+
151
+ category_id_to_name: Dict[str, str] = {}
152
+ attacks = [
153
+ o for o in jdata["objects"] if o["type"] == "attack-pattern" and not o.get("x_mitre_is_subtechnique", True)
154
+ ]
155
+ for attack in attacks:
156
+ ids = [ref["external_id"] for ref in attack["external_references"] if ref["source_name"] == "mitre-attack"]
157
+ assert len(ids) == 1
158
+ id = ids[0]
159
+ category_id_to_name[id] = attack["name"]
160
+ return category_id_to_name
161
+
162
+ def get_references(self, num_references: int, correct_cname: str, cnames: List[str]) -> List[Reference]:
163
+ """
164
+ Randomly select k tec_category_names (attack technique category names) as choices.
165
+ However, choose not to include "excluded",
166
+ and if k is less than the total number of possible choices, add a default case.
167
+ - k : number of choices
168
+ - correct_cname : correct attack technique category names
169
+ - cnames : list containing all attack technique category names
170
+ """
171
+ assert num_references >= 2, "Need at least 2 references for the correct choice and 'Others'"
172
+ num_incorrect_cname_samples = num_references - 2
173
+ assert num_references <= len(
174
+ cnames
175
+ ), f"Cannot have more references than the number of categories, which is {len(cnames)}"
176
+ incorrect_cnames = [cname for cname in cnames if cname != correct_cname]
177
+ incorrect_cname_samples = self.random.sample(
178
+ incorrect_cnames, min(len(incorrect_cnames), num_incorrect_cname_samples)
179
+ )
180
+ references = [Reference(Output(text=cname), tags=[]) for cname in incorrect_cname_samples]
181
+ references.append(Reference(Output(text=correct_cname), tags=[CORRECT_TAG]))
182
+ self.random.shuffle(references)
183
+ if num_references <= len(cnames):
184
+ references.append(Reference(Output(text=CtiToMitreScenario.OTHERS_OPTION), tags=[]))
185
+ return references
186
+
187
+ def create_multiple_choice_instances(
188
+ self, df: DataFrame, split: str, label_cname: Dict[str, str]
189
+ ) -> List[Instance]:
190
+ """Create a list of instances corresponding to the multiple choice task"""
191
+ instances = []
192
+ for idx in df.index:
193
+ linedata = df.loc[idx]
194
+ sentence = linedata["sentence"]
195
+ label_tec = linedata["label_tec"]
196
+ correct_cname = label_cname[label_tec]
197
+ all_cnames = [cname for cname in label_cname.values()]
198
+ references = self.get_references(self.num_options, correct_cname, all_cnames)
199
+ input = Input(text=sentence)
200
+ instance = Instance(input, references, split=split)
201
+ instances.append(instance)
202
+ return instances
203
+
204
+ def get_instances(self, output_path: str) -> List[Instance]:
205
+ data_dir = os.path.join(output_path, "data")
206
+ ensure_directory_exists(data_dir)
207
+
208
+ dataset_path = os.path.join(data_dir, "dataset.csv")
209
+ ensure_file_downloaded(
210
+ source_url="https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/a8cacf3185d098c686e0d88768a619a03a4d76d1/data/dataset.csv", # noqa: E501
211
+ target_path=dataset_path,
212
+ )
213
+
214
+ labels_path = os.path.join(data_dir, "enterprise-attack.json")
215
+ ensure_file_downloaded(
216
+ source_url="https://github.com/mitre/cti/raw/refs/tags/ATT&CK-v10.1/enterprise-attack/enterprise-attack.json", # noqa: E501
217
+ target_path=labels_path,
218
+ )
219
+
220
+ # load dataset
221
+ all_df = pd.read_csv(dataset_path)
222
+
223
+ # split all_df into train and test data frames
224
+ train_df = all_df.sample(frac=CtiToMitreScenario.train_ratio, random_state=self.random_seed)
225
+ test_df = all_df.drop(train_df.index).sample(frac=1, random_state=self.random_seed)
226
+
227
+ # load labels
228
+ with open(labels_path) as f:
229
+ jdata = json.load(f)
230
+
231
+ # make mapping from label_tec to tec_category_name
232
+ label_cname = self.make_label_category_name_dict(jdata)
233
+
234
+ # create instances from each dataset
235
+ instances_train = self.create_multiple_choice_instances(train_df, TRAIN_SPLIT, label_cname)
236
+ instances_test = self.create_multiple_choice_instances(test_df, TEST_SPLIT, label_cname)
237
+
238
+ # return all instances
239
+ all_instances = instances_train + instances_test
240
+ return all_instances
@@ -2,7 +2,7 @@ import os
2
2
  from typing import List
3
3
 
4
4
  from helm.common.codec import from_jsonl
5
- from .scenario import Scenario, Instance
5
+ from helm.benchmark.scenarios.scenario import Scenario, Instance
6
6
 
7
7
 
8
8
  class CustomMCQAScenario(Scenario):
@@ -0,0 +1,130 @@
1
+ import datasets
2
+ import os
3
+ from typing import List
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ CORRECT_TAG,
7
+ Scenario,
8
+ Instance,
9
+ Reference,
10
+ TEST_SPLIT,
11
+ Input,
12
+ Output,
13
+ )
14
+ from helm.common.general import ensure_directory_exists
15
+
16
+
17
+ class CzechBankQAScenario(Scenario):
18
+ INSTRUCTIONS = """Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.
19
+
20
+ Database schema:
21
+ CREATE TABLE "account" (
22
+ "account_id" integer NOT NULL DEFAULT '0'
23
+ , "district_id" integer NOT NULL DEFAULT '0'
24
+ , "frequency" varchar(18) NOT NULL
25
+ , "date" date NOT NULL
26
+ , PRIMARY KEY ("account_id")
27
+ , CONSTRAINT "account_ibfk_1" FOREIGN KEY ("district_id") REFERENCES "district" ("district_id")
28
+ );
29
+ CREATE TABLE "card" (
30
+ "card_id" integer NOT NULL DEFAULT '0'
31
+ , "disp_id" integer NOT NULL
32
+ , "type" varchar(7) NOT NULL
33
+ , "issued" date NOT NULL
34
+ , PRIMARY KEY ("card_id")
35
+ , CONSTRAINT "card_ibfk_1" FOREIGN KEY ("disp_id") REFERENCES "disp" ("disp_id")
36
+ );
37
+ CREATE TABLE "client" (
38
+ "client_id" integer NOT NULL
39
+ , "gender" varchar(1) NOT NULL
40
+ , "birth_date" date NOT NULL
41
+ , "district_id" integer NOT NULL
42
+ , PRIMARY KEY ("client_id")
43
+ , CONSTRAINT "client_ibfk_1" FOREIGN KEY ("district_id") REFERENCES "district" ("district_id")
44
+ );
45
+ CREATE TABLE "disp" (
46
+ "disp_id" integer NOT NULL
47
+ , "client_id" integer NOT NULL
48
+ , "account_id" integer NOT NULL
49
+ , "type" varchar(9) NOT NULL
50
+ , PRIMARY KEY ("disp_id")
51
+ , CONSTRAINT "disp_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
52
+ , CONSTRAINT "disp_ibfk_2" FOREIGN KEY ("client_id") REFERENCES "client" ("client_id")
53
+ );
54
+ CREATE TABLE "district" (
55
+ "district_id" integer NOT NULL DEFAULT '0'
56
+ , "A2" varchar(19) NOT NULL
57
+ , "A3" varchar(15) NOT NULL
58
+ , "A4" integer NOT NULL
59
+ , "A5" integer NOT NULL
60
+ , "A6" integer NOT NULL
61
+ , "A7" integer NOT NULL
62
+ , "A8" integer NOT NULL
63
+ , "A9" integer NOT NULL
64
+ , "A10" decimal(4,1) NOT NULL
65
+ , "A11" integer NOT NULL
66
+ , "A12" decimal(4,1) DEFAULT NULL
67
+ , "A13" decimal(3,2) NOT NULL
68
+ , "A14" integer NOT NULL
69
+ , "A15" integer DEFAULT NULL
70
+ , "A16" integer NOT NULL
71
+ , PRIMARY KEY ("district_id")
72
+ );
73
+ CREATE TABLE "loan" (
74
+ "loan_id" integer NOT NULL DEFAULT '0'
75
+ , "account_id" integer NOT NULL
76
+ , "date" date NOT NULL
77
+ , "amount" integer NOT NULL
78
+ , "duration" integer NOT NULL
79
+ , "payments" decimal(6,2) NOT NULL
80
+ , "status" varchar(1) NOT NULL
81
+ , PRIMARY KEY ("loan_id")
82
+ , CONSTRAINT "loan_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
83
+ );
84
+ CREATE TABLE "order" (
85
+ "order_id" integer NOT NULL DEFAULT '0'
86
+ , "account_id" integer NOT NULL
87
+ , "bank_to" varchar(2) NOT NULL
88
+ , "account_to" integer NOT NULL
89
+ , "amount" decimal(6,1) NOT NULL
90
+ , "k_symbol" varchar(8) NOT NULL
91
+ , PRIMARY KEY ("order_id")
92
+ , CONSTRAINT "order_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
93
+ );
94
+ CREATE TABLE "trans" (
95
+ "trans_id" integer NOT NULL DEFAULT '0'
96
+ , "account_id" integer NOT NULL DEFAULT '0'
97
+ , "date" date NOT NULL
98
+ , "type" varchar(6) NOT NULL
99
+ , "operation" varchar(14) DEFAULT NULL
100
+ , "amount" integer NOT NULL
101
+ , "balance" integer NOT NULL
102
+ , "k_symbol" varchar(11) DEFAULT NULL
103
+ , "bank" varchar(2) DEFAULT NULL
104
+ , "account" integer DEFAULT NULL
105
+ , PRIMARY KEY ("trans_id")
106
+ , CONSTRAINT "trans_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
107
+ );""" # noqa: E501
108
+
109
+ """CzechBankQA"""
110
+ name = "czech_bank_qa"
111
+ description = "This is a list of SQL queries for a text-to-SQL task over the Czech Bank 1999 dataset."
112
+ tags = ["text_to_sql"]
113
+
114
+ def __init__(self, config_name: str):
115
+ super().__init__()
116
+ self.config_name = config_name
117
+
118
+ def get_instances(self, output_path: str) -> List[Instance]:
119
+ cache_dir = os.path.join(output_path, "data")
120
+ ensure_directory_exists(cache_dir)
121
+ dataset = datasets.load_dataset(
122
+ "yifanmai/czech_bank_qa", name=self.config_name, split="test", cache_dir=cache_dir
123
+ )
124
+ instances: List[Instance] = []
125
+ for row in dataset:
126
+ input = Input(text=row["description"])
127
+ references = [Reference(output=Output(text=row["sql_query"]), tags=[CORRECT_TAG])]
128
+ instance = Instance(input=input, references=references, split=TEST_SPLIT)
129
+ instances.append(instance)
130
+ return instances
@@ -3,7 +3,7 @@ import json
3
3
  from typing import List
4
4
  from helm.common.general import ensure_file_downloaded
5
5
  from helm.common.general import ensure_directory_exists
6
- from .scenario import Scenario, Instance, Input, Reference, CORRECT_TAG, Output, VALID_SPLIT
6
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, CORRECT_TAG, Output, VALID_SPLIT
7
7
 
8
8
  TASKS = {
9
9
  "counterfactual": ["snli_premise", "snli_hypothesis"]
@@ -5,7 +5,7 @@ import random
5
5
  from tqdm import tqdm
6
6
  from typing import List, Dict, Optional
7
7
  from helm.common.general import ensure_file_downloaded
8
- from .scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
8
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
9
9
 
10
10
  option_keys = ["pii", "privacy_understanding", "enron_email_extraction"]
11
11
 
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  import os
3
3
  from typing import List, Dict
4
- from .scenario import Scenario, Instance, Input, TEST_SPLIT
4
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
5
5
  from helm.common.general import ensure_file_downloaded
6
6
  from helm.benchmark.scenarios.scenario import Reference, Output
7
7
 
@@ -4,7 +4,7 @@ import random
4
4
  from typing import List, Dict
5
5
 
6
6
  from helm.common.general import ensure_file_downloaded
7
- from .scenario import Scenario, Instance, TEST_SPLIT, Input
7
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
8
8
 
9
9
 
10
10
  DATA_REPO_HASH = "38972f6ccbf376a8d0660babafb4d2b3b9cca3f4"
@@ -6,7 +6,18 @@ from typing import Dict, List, Optional
6
6
 
7
7
  from helm.common.general import ensure_file_downloaded
8
8
  from helm.common.hierarchical_logger import hlog
9
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
9
+ from helm.benchmark.runner import get_benchmark_output_path
10
+ from helm.benchmark.scenarios.scenario import (
11
+ Scenario,
12
+ Instance,
13
+ Reference,
14
+ TRAIN_SPLIT,
15
+ VALID_SPLIT,
16
+ TEST_SPLIT,
17
+ CORRECT_TAG,
18
+ Input,
19
+ Output,
20
+ )
10
21
 
11
22
 
12
23
  @dataclass(frozen=True)
@@ -138,5 +149,5 @@ class EmpatheticDialoguesScenario(Scenario):
138
149
 
139
150
  if __name__ == "__main__":
140
151
  scenario = EmpatheticDialoguesScenario()
141
- instances = scenario.get_instances("./benchmark_output/scenarios/empatheticdialogues")
152
+ instances = scenario.get_instances(os.path.join(get_benchmark_output_path(), "scenarios/empatheticdialogues"))
142
153
  print(instances[100])
@@ -0,0 +1,157 @@
1
+ from typing import List
2
+ from helm.common.general import ensure_directory_exists
3
+ from helm.benchmark.scenarios.scenario import (
4
+ Input,
5
+ Scenario,
6
+ Instance,
7
+ TEST_SPLIT,
8
+ CORRECT_TAG,
9
+ Reference,
10
+ Output,
11
+ )
12
+ import pandas as pd
13
+
14
+
15
+ def file_preprocessing(data_path: str, task_objective: str) -> pd.DataFrame:
16
+ """
17
+ Preprocess the data files to create a DataFrame with the necessary columns.
18
+ task_objective: 'brief_hospital_course' or 'discharge_instructions'
19
+ Use command to download: wget -r -N -c -np --user {PHYSIONET_USERNAME} \
20
+ --ask-password https://physionet.org/files/discharge-me/1.3/
21
+ data_path is directory that contains the downloaded files: '{base_dir}/physionet.org/'
22
+ """
23
+ # Load the first CSV file
24
+ df_diagnosis = pd.read_csv(
25
+ f"{data_path}/files/discharge-me/1.3/test_phase_1/diagnosis.csv.gz", compression="gzip", keep_default_na=False
26
+ )
27
+ df_discharge = pd.read_csv(
28
+ f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz", compression="gzip", keep_default_na=False
29
+ )
30
+ df_target = pd.read_csv(
31
+ f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge_target.csv.gz",
32
+ compression="gzip",
33
+ keep_default_na=False,
34
+ )
35
+ df_radiology = pd.read_csv(
36
+ f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz", compression="gzip", keep_default_na=False
37
+ )
38
+ df_ed = pd.read_csv(
39
+ f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz", compression="gzip", keep_default_na=False
40
+ )
41
+ df_triage = pd.read_csv(
42
+ f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz", compression="gzip", keep_default_na=False
43
+ )
44
+ df_diagnosis_triage = pd.merge(
45
+ df_diagnosis, df_triage, on="subject_id", how="inner", suffixes=("_df_diagnosis", "_df_triage")
46
+ )
47
+ df_diagnosis_triage_discharge = pd.merge(
48
+ df_diagnosis_triage, df_discharge, on="subject_id", how="inner", suffixes=("", "_df_discharge")
49
+ )
50
+ df_diagnosis_triage_discharge_radiology = pd.merge(
51
+ df_diagnosis_triage_discharge, df_radiology, on="hadm_id", how="inner", suffixes=("", "_df_radiology")
52
+ )
53
+
54
+ df_features = pd.merge(
55
+ df_diagnosis_triage_discharge_radiology, df_ed, on="hadm_id", how="inner", suffixes=("", "_df_ed")
56
+ )
57
+
58
+ # Reduce the DataFrame to remove duplicate hadm_id
59
+ df_features_reduced = df_features.drop_duplicates(subset="hadm_id")
60
+ columns_to_keep = ["text", "text_df_radiology", "hadm_id"]
61
+ df_input = df_features_reduced[columns_to_keep]
62
+ final_df = pd.merge(df_input, df_target, on="hadm_id", how="inner")
63
+
64
+ def remove_substring(string, substring):
65
+ return string.replace(substring, "")
66
+
67
+ final_df["text"] = final_df.apply(lambda row: remove_substring(row["text"], row[task_objective]), axis=1)
68
+ return final_df
69
+
70
+
71
+ def create_prompt(text: str, text_df_radiology: str, task_objective: str) -> str:
72
+ """
73
+ Create the prompt for the instance.
74
+ """
75
+ prompt = f"Generate the {task_objective} from the following patient discharge text and radiology report text.\
76
+ \n\nDischarge Text:\n{text}\n\nRadiology Report:\n{text_df_radiology}\n\n{task_objective}:\n"
77
+ return prompt
78
+
79
+
80
+ class DischargeMeScenario(Scenario):
81
+ """
82
+ DischargeMe is a discharge instruction generation dataset and brief hospital course generation \
83
+ dataset collected from MIMIC-IV data.
84
+ In this scenario, we only consider the discharge text as well as the radiology report text.
85
+ We are using the phase I test set which is composed of 14,702 hospital admission instances.
86
+
87
+ The splits are provided by the dataset itself.
88
+
89
+ TASKS = {discharge instruction, brief hospital course}
90
+ Sample Synthetic Prompt:
91
+ Generate the {TASK} from the following patient discharge text and radiology report text.
92
+
93
+ Discharge Text:
94
+ Name: {Patient Name} Unit No: {Unit Number} Date of Birth: {DOB} Date of Admission:
95
+ {DOA} Date of Discharge: {DOD}
96
+ Chief Complaint: {Chief Complaint} History of Present Illness: {HPI} Past Medical History: {PMH}
97
+ Medications on Admission: {Medications} Allergies: {Allergies} Physical Exam: {Physical Exam}
98
+ Discharge Diagnosis: {Discharge Diagnosis}
99
+
100
+ Radiology Report:
101
+ {Radiology Report}
102
+
103
+ {TASK}:
104
+ @inproceedings{Xu_2024,
105
+ title={ Discharge me: Bionlp acl’24 shared task on streamlining discharge documentation.},
106
+ url={https://doi.org/10.13026/4a0k-4360},
107
+ DOI={10.13026/27pt-1259},
108
+ booktitle={ Proceedings of the 23rd Workshop on Biomedical Natural Language Processing (BioNLP) at ACL 2024},
109
+ publisher={Association for Computational Linguistics},
110
+ author={Xu, Justin and Delbrouck, Jean-Benoit and Johnston, Andrew and Blankemeier, Louis and Langlotz, Curtis},
111
+ year={2024}
112
+ }
113
+ """
114
+
115
+ name = "dischargeme"
116
+ description = "DischargeMe is a discharge instruction generation dataset and brief hospital course generation \
117
+ dataset collected from MIMIC-IV data, consindering only the discharge text as well as the radiology report text."
118
+ tags = ["biomedical"]
119
+
120
+ def get_instances(self, output_path: str) -> List[Instance]:
121
+ data_path = "/share/pi/nigam/data/physionet.org"
122
+ ensure_directory_exists(data_path)
123
+ instances: List[Instance] = []
124
+ df_bhc = file_preprocessing(data_path, "brief_hospital_course")
125
+ df_di = file_preprocessing(data_path, "discharge_instructions")
126
+
127
+ for i in range(df_bhc.shape[0]):
128
+ prompt_bhc = create_prompt(
129
+ df_bhc.iloc[i]["text"], df_bhc.iloc[i]["text_df_radiology"], "Brief Hospital Course"
130
+ )
131
+ prompt_di = create_prompt(
132
+ df_di.iloc[i]["text"], df_di.iloc[i]["text_df_radiology"], "Discharge Instructions"
133
+ )
134
+ answer_bhc = df_bhc.iloc[i]["brief_hospital_course"]
135
+ answer_di = df_di.iloc[i]["discharge_instructions"]
136
+ instances.append(
137
+ Instance(
138
+ input=Input(text=prompt_bhc),
139
+ references=[Reference(Output(text=answer_bhc), tags=[CORRECT_TAG])],
140
+ split=TEST_SPLIT,
141
+ )
142
+ )
143
+ instances.append(
144
+ Instance(
145
+ input=Input(text=prompt_di),
146
+ references=[Reference(Output(text=answer_di), tags=[CORRECT_TAG])],
147
+ split=TEST_SPLIT,
148
+ )
149
+ )
150
+
151
+ return instances
152
+
153
+ def read_file(self, file_path: str) -> List[str]:
154
+ with open(file_path, "r") as file:
155
+ lines = file.readlines()
156
+ lines = [line.strip() for line in lines]
157
+ return lines
@@ -3,7 +3,16 @@ import os
3
3
  from typing import List, Dict, Optional
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded
6
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, CORRECT_TAG, Input, Output
6
+ from helm.benchmark.scenarios.scenario import (
7
+ Scenario,
8
+ Instance,
9
+ Reference,
10
+ TRAIN_SPLIT,
11
+ VALID_SPLIT,
12
+ CORRECT_TAG,
13
+ Input,
14
+ Output,
15
+ )
7
16
 
8
17
  REITERATION_DATA_URL = "https://drive.google.com/uc?export=download&id=1uVJbsgPCHFAvH43I6SVvU3Ayo8dh-y_N"
9
18
  WEDGING_DATA_URL = "https://drive.google.com/uc?export=download&id=1kWB3_F4Tobc_oVGC_T-a5DHEh-AB4GTc"
@@ -2,7 +2,16 @@ import numpy as np
2
2
  import random
3
3
  from typing import List, Tuple
4
4
 
5
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TRAIN_SPLIT,
10
+ TEST_SPLIT,
11
+ CORRECT_TAG,
12
+ Input,
13
+ Output,
14
+ )
6
15
 
7
16
 
8
17
  class DyckLanguageScenario(Scenario):