crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the advisory details on the package registry page for more information.

Files changed (580):
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,141 @@
1
+ import os
2
+ import requests
3
+ from typing import List
4
+ from helm.benchmark.scenarios.scenario import (
5
+ Scenario,
6
+ Instance,
7
+ Reference,
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Input,
11
+ Output,
12
+ )
13
+ from helm.common.general import ensure_directory_exists
14
+
15
+
16
class MTSamplesProceduresScenario(Scenario):
    """
    Processes the MTSamples Procedure dataset, a subset of MTSamples,
    specifically focusing on procedure-related medical notes.
    This dataset contains transcribed medical reports detailing various procedures,
    treatments, and surgical interventions.

    - Extracts `PLAN`, `SUMMARY`, or `FINDINGS` sections as references.
    - Ensures these sections are excluded from the input text.
    - Filters out files that do not contain any of the three reference sections.

    Data source: https://github.com/raulista1997/benchmarkdata/tree/main/mtsample_procedure
    """

    GIT_HASH = "c4c252443fa9c52afb6960f53e51be278639bea2"
    GITHUB_DIR_URL = f"https://github.com/raulista1997/benchmarkdata/tree/{GIT_HASH}/mtsample_procedure"
    RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsample_procedure/"

    name = "mtsamples"
    description = (
        "A dataset that provides a patient note regarding an operation, with the objective to document the procedure."
    )
    tags = ["medical", "transcription", "plan_generation"]

    def fetch_file_list(self) -> List[str]:
        """
        Uses the GitHub API to fetch the list of `.txt` files in the dataset directory.

        The request is pinned to `GIT_HASH` via the `ref` query parameter so the file
        list always matches the raw files that `download_file` fetches from
        `RAW_BASE_URL` (which is pinned to the same commit). Without the pin, a new
        commit on the default branch could list files that do not exist at the
        pinned commit.
        """
        api_url = (
            "https://api.github.com/repos/raulista1997/benchmarkdata/contents/mtsample_procedure"
            f"?ref={self.GIT_HASH}"
        )
        headers = {"Accept": "application/vnd.github+json"}

        response = requests.get(api_url, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch file list from GitHub API ({api_url})")

        files = response.json()
        return [file["name"] for file in files if file["name"].endswith(".txt")]

    def download_file(self, file_name: str, output_dir: str) -> str:
        """
        Downloads a text file from GitHub (pinned commit) and saves it locally.
        Returns the local file path; skips the download if the file already exists.
        """
        file_url = self.RAW_BASE_URL + file_name
        file_path = os.path.join(output_dir, file_name)

        if not os.path.exists(file_path):  # Avoid redundant downloads
            response = requests.get(file_url)
            if response.status_code != 200:
                raise Exception(f"Failed to download {file_url}")

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(response.text)

        return file_path

    def extract_sections(self, text: str) -> tuple:
        """
        Extracts the `PLAN`, `SUMMARY`, and `FINDINGS` sections from the text.
        Returns a (plan, summary, findings) tuple; each element is the first line
        after the section header (original casing preserved), or None if the
        section is absent. Header detection is case-insensitive.
        """
        text_upper = text.upper()

        def first_line_after(header):
            # Find the header case-insensitively, then slice the ORIGINAL text.
            # (The previous code split the original text on the uppercase header
            # after only checking the uppercased copy, which raised IndexError
            # whenever the header appeared in lowercase.)
            index = text_upper.find(header)
            if index == -1:
                return None
            return text[index + len(header):].split("\n", 1)[0].strip()

        return first_line_after("PLAN:"), first_line_after("SUMMARY:"), first_line_after("FINDINGS:")

    def remove_sections(self, text: str) -> str:
        """
        Truncates the note at the first `PLAN:`, `SUMMARY:`, or `FINDINGS:` header
        found (case-sensitive), keeping only the content before it. Note that this
        drops everything after the header, not just the section body itself.
        """
        for section in ("PLAN:", "SUMMARY:", "FINDINGS:"):
            if section in text:
                text = text.split(section)[0].strip()  # Keep content before the section
        return text

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Downloads, processes, and converts MTSamples data into HELM format.
        Files that fail to download or parse are skipped with a logged error
        (best-effort, matching the original behavior).
        """
        ensure_directory_exists(output_path)

        # Fetch list of available files from GitHub
        file_list = self.fetch_file_list()

        instances = []
        for file_name in file_list:
            try:
                # Download the text file
                file_path = self.download_file(file_name, output_path)

                # Read content
                with open(file_path, "r", encoding="utf-8") as f:
                    text_content = f.read().strip()

                # Extract structured sections
                plan, summary, findings = self.extract_sections(text_content)

                # Use plan > summary > findings as reference text
                reference_text = plan or summary or findings
                if not reference_text:
                    continue  # Ignore notes with no reference section

                # Remove structured sections from input
                cleaned_text = self.remove_sections(text_content)

                # Create HELM instance
                instances.append(
                    Instance(
                        input=Input(text=cleaned_text),  # Processed text without sections
                        references=[Reference(Output(text=reference_text), tags=[CORRECT_TAG])],
                        split=TEST_SPLIT,
                    )
                )
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

        return instances
@@ -0,0 +1,141 @@
1
+ import os
2
+ import requests
3
+ from typing import List
4
+ from helm.benchmark.scenarios.scenario import (
5
+ Scenario,
6
+ Instance,
7
+ Reference,
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Input,
11
+ Output,
12
+ )
13
+ from helm.common.general import ensure_directory_exists
14
+
15
+
16
class MTSamplesReplicateScenario(Scenario):
    """
    MTSamples.com is designed to give you access to a big collection of transcribed medical reports.
    These samples can be used by learning, as well as working medical transcriptionists for their daily
    transcription needs. We present the model with patient information and request it to generate a corresponding
    treatment plan.

    Sample Synthetic Prompt:
        Given various information about a patient, return a reasonable treatment plan for the patient.

    - Extracts `PLAN`, `SUMMARY`, or `FINDINGS` as the reference (PLAN preferred).
    - Truncates the input at the `PLAN:` header so the reference plan is not leaked to the model.
    - Ignores files that do not contain any of these reference sections.
    """

    GIT_HASH = "ebc104a4f96c5b7602242f301e081e9934a23344"
    API_BASE_URL = (
        f"https://api.github.com/repos/raulista1997/benchmarkdata/contents/mtsamples_processed?ref={GIT_HASH}"
    )
    RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsamples_processed/"

    name = "mtsamples_replicate"
    description = (
        "A dataset of clinical notes where the model is prompted to generate "
        "a reasonable treatment plan for the patient based on transcribed medical reports."
    )
    tags = ["medical", "transcription", "plan_generation"]

    def fetch_file_list(self) -> List[str]:
        """
        Uses the GitHub API to fetch the list of `.txt` files at a specific commit
        (the `ref` query parameter in `API_BASE_URL` pins the listing to `GIT_HASH`).
        """
        response = requests.get(self.API_BASE_URL)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch file list from GitHub API: {response.text}")

        files = response.json()
        return [f["name"] for f in files if f["name"].endswith(".txt")]

    def download_file(self, file_name: str, output_dir: str) -> str:
        """
        Downloads a text file from GitHub (pinned commit) and saves it locally.
        Returns the local file path; skips the download if the file already exists.
        """
        file_url = self.RAW_BASE_URL + file_name
        file_path = os.path.join(output_dir, file_name)

        if not os.path.exists(file_path):
            response = requests.get(file_url)
            if response.status_code != 200:
                raise Exception(f"Failed to download {file_url}")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(response.text)

        return file_path

    def extract_sections(self, text: str) -> tuple:
        """
        Extracts the `PLAN`, `SUMMARY`, and `FINDINGS` sections from the text.
        Returns a (plan, summary, findings) tuple; each element is the first line
        after the section header (original casing preserved), or None if the
        section is absent. Header detection is case-insensitive.
        """
        text_upper = text.upper()

        def first_line_after(header):
            # Find the header case-insensitively, then slice the ORIGINAL text.
            # (The previous code split the original text on the uppercase header
            # after only checking the uppercased copy, which raised IndexError
            # whenever the header appeared in lowercase.)
            index = text_upper.find(header)
            if index == -1:
                return None
            return text[index + len(header):].split("\n", 1)[0].strip()

        return first_line_after("PLAN:"), first_line_after("SUMMARY:"), first_line_after("FINDINGS:")

    def remove_plan_section(self, text: str) -> str:
        """
        Truncates the note at the `PLAN:` header, keeping only the content before it.
        Note that this also drops any sections that appear after `PLAN:` — the
        previous docstring's claim that other sections were kept was incorrect.
        """
        if "PLAN:" in text:
            text = text.split("PLAN:")[0].strip()  # Keep content before PLAN
        return text

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Downloads, processes, and converts MTSamples data into HELM format.
        Files that fail to download or parse are skipped with a logged error
        (best-effort, matching the original behavior).
        """
        ensure_directory_exists(output_path)

        # Fetch list of available files from GitHub
        file_list = self.fetch_file_list()

        instances = []
        for file_name in file_list:
            try:
                # Download the text file
                file_path = self.download_file(file_name, output_path)

                # Read content
                with open(file_path, "r", encoding="utf-8") as f:
                    text_content = f.read().strip()

                # Extract structured sections
                plan, summary, findings = self.extract_sections(text_content)

                # Use plan > summary > findings as reference text
                reference_text = plan or summary or findings
                if not reference_text:
                    continue  # Ignore notes with no reference section

                # Remove PLAN section from input
                cleaned_text = self.remove_plan_section(text_content)

                # Create HELM instance
                instances.append(
                    Instance(
                        input=Input(text=cleaned_text),  # Processed text without PLAN
                        references=[Reference(Output(text=reference_text), tags=[CORRECT_TAG])],
                        split=TEST_SPLIT,
                    )
                )
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

        return instances
@@ -0,0 +1,271 @@
1
+ import os
2
+ import re
3
+
4
+ from typing import Any, Dict, List, Tuple, Optional
5
+ import xml.etree.ElementTree as ET
6
+
7
+ from helm.common.general import ensure_directory_exists
8
+ from helm.benchmark.scenarios.scenario import (
9
+ TEST_SPLIT,
10
+ TRAIN_SPLIT,
11
+ Input,
12
+ Scenario,
13
+ Instance,
14
+ CORRECT_TAG,
15
+ Reference,
16
+ Output,
17
+ )
18
+
19
+ ORIGINAL_DEFINITIONS = {
20
+ "ABDOMINAL": "History of intra-abdominal surgery, small or large intestine resection, or small bowel obstruction",
21
+ "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define “advanced” \
22
+ as having 2 or more of the following: • Taking 2 or more medications to treat CAD • \
23
+ History of myocardial infarction (MI) • Currently experiencing angina • Ischemia, past or present",
24
+ "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
25
+ "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)",
26
+ "CREATININE": "Serum creatinine level above the upper normal limit",
27
+ "DIETSUPP-2MOS": "Taken a dietary supplement (excluding vitamin D) in the past 2 months",
28
+ "DRUG-ABUSE": "Current or past history of drug abuse",
29
+ "ENGLISH": "Patient must speak English",
30
+ "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
31
+ "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
32
+ "MAJOR-DIABETES": "Major diabetes-related complication. For the purposes of this annotation, we define \
33
+ “major complication” (as opposed to “minor complication”) as any of the following that are a result of \
34
+ (or strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • \
35
+ Retinopathy • nephropathy • neuropathy",
36
+ "MAKES-DECISIONS": "Patient must make their own medical decisions",
37
+ "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
38
+ }
39
+ # Custom definitions for better prompts
40
+ LONG_DEFINITIONS = {
41
+ "ABDOMINAL": "History of intra-abdominal surgery. This could include any form of intra-abdominal surgery, \
42
+ including but not limited to small/large intestine resection or small bowel obstruction",
43
+ "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define \
44
+ “advanced” as having 2 or more of the following: (a) Taking 2 or more medications to treat CAD (b) History \
45
+ of myocardial infarction (MI) (c) Currently experiencing angina (d) Ischemia, past or present. \
46
+ The patient must have at least 2 of these categories (a,b,c,d) to meet this criterion, otherwise the patient \
47
+ does not meet this criterion. For ADVANCED-CAD, be strict in your evaluation of the patient -- if they just \
48
+ have cardiovascular disease, then they do not meet this criterion.",
49
+ "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
50
+ "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)..",
51
+ "CREATININE": "Serum creatinine level above the upper normal limit",
52
+ "DIETSUPP-2MOS": "Consumption of a dietary supplement (excluding vitamin D) in the past 2 months. To assess \
53
+ this criterion, go through the list of medications_and_supplements taken from the note. If a substance could \
54
+ potentially be used as a dietary supplement (i.e. it is commonly used as a dietary supplement, even if it \
55
+ is not explicitly stated as being used as a dietary supplement), then the patient meets this criterion. \
56
+ Be lenient and broad in what is considered a dietary supplement. For example, a 'multivitamin' and \
57
+ 'calcium carbonate' should always be considered a dietary supplement if they are included in this list.",
58
+ "DRUG-ABUSE": "Current or past history of drug abuse",
59
+ "ENGLISH": "Patient speaks English. Assume that the patient speaks English, unless otherwise explicitly noted. \
60
+ If the patient's language is not mentioned in the note, then assume they speak English and thus meet \
61
+ this criteria.",
62
+ "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
63
+ "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
64
+ "MAJOR-DIABETES": "Major diabetes-related complication. Examples of “major complication” (as opposed to \
65
+ “minor complication”) include, but are not limited to, any of the following that are a result of (or \
66
+ strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • Retinopathy \
67
+ • nephropathy • neuropathy. Additionally, if multiple conditions together imply a severe case of diabetes, \
68
+ then count that as a major complication.",
69
+ "MAKES-DECISIONS": "Patient must make their own medical decisions. Assume that the patient makes their own \
70
+ medical decisions, unless otherwise explicitly noted. There is no information provided about the \
71
+ patient's ability to make their own medical decisions, then assume they do make their own decisions and \
72
+ therefore meet this criteria.\"",
73
+ "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
74
+ }
75
+
76
+
77
class XMLDataLoader:
    """Loads the n2c2 cohort-selection XML files (one file per patient).

    Each XML file contains a TEXT element (the concatenated clinical notes,
    separated by a run of 100 asterisks) and a TAGS element with one subtag per
    inclusion criterion carrying a `met` attribute.
    """

    def __init__(
        self, path_to_folder: str, is_convert_to_numbers=True, is_split_text=True, is_remove_excessive_new_lines=True
    ):
        self.path_to_folder = path_to_folder
        # Map "met"/"not met" attribute values to 1/0 instead of raw strings.
        self.is_convert_to_numbers = is_convert_to_numbers
        # Split the TEXT blob into individual notes on the asterisk separator.
        self.is_split_text = is_split_text
        # Collapse triple newlines into one (single pass).
        self.is_remove_excessive_new_lines = is_remove_excessive_new_lines

    def load_data(self) -> List[Dict[str, Any]]:
        """Main function: Data loader for the XML files.

        Returns one dict per patient: {"patient_id", "ehr" (list of notes),
        "labels"}. Files are processed in sorted order for determinism.
        """
        data = []
        file_names = sorted(file for file in os.listdir(self.path_to_folder) if file.endswith(".xml"))
        for file_name in file_names:
            file_path = os.path.join(self.path_to_folder, file_name)
            text, labels = self.parse_xml(file_path)
            data.append({"patient_id": file_name.replace(".xml", ""), "ehr": text, "labels": labels})

        return data

    @staticmethod
    def get_date_of_note(patient: Dict[str, Any], note_idx: int) -> Optional[str]:
        """Get the date of note `note_idx`, or None (with a logged error) if absent."""
        # `<` (not `<=`): an index equal to len(ehr) would have passed the old
        # check and then raised IndexError on the lookup below.
        assert note_idx < len(patient["ehr"]), f"{note_idx} out of bounds for {patient['patient_id']}"
        note: str = patient["ehr"][note_idx]
        match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
        date = match.group(1) if match else None
        if not date:
            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
        return date

    @staticmethod
    def get_current_date_for_patient(patient: Dict[str, Any]) -> Optional[str]:
        """Get the most recent "Record date:" visible across all notes for a patient."""
        most_recent_date = None
        for note in patient["ehr"]:
            match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
            # Notes are assumed to be in chronological order, so the last match wins.
            most_recent_date = match.group(1) if match else most_recent_date
        if not most_recent_date:
            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
        return most_recent_date

    def parse_xml(self, XML_file) -> Tuple[List[str], Dict[str, Any]]:
        """Parse one patient XML file into (list of note texts, tag dict)."""
        tree = ET.parse(XML_file)
        root = tree.getroot()
        text_content = ""
        result_text: List[str] = []
        tags: Dict[str, Any] = {}
        for elem in root.iter():
            if elem.tag == "TEXT":
                text_content = elem.text if elem.text else ""
                if self.is_remove_excessive_new_lines:
                    text_content = self.remove_excessive_newlines(text_content)
                if self.is_split_text:
                    result_text = self.split_text(text_content)
                else:
                    result_text = [text_content]
            elif elem.tag == "TAGS":
                tags = self.read_tags(root)
        return (result_text, tags)

    def read_tags(self, root) -> Dict[str, Any]:
        """Reads the tags from an XML file and returns a dictionary of tags.

        Values are the raw `met` attribute strings, or 1/0 when
        `is_convert_to_numbers` is set (1 iff the attribute equals "met").
        The `Any` value type reflects that mix of str and int.
        """
        tags_dict: Dict[str, Any] = {}
        for tag in root.iter("TAGS"):
            for subtag in tag:
                met_value = subtag.attrib.get("met")
                if self.is_convert_to_numbers:
                    met_value = 1 if met_value == "met" else 0
                tags_dict[subtag.tag] = met_value
        return tags_dict

    def split_text(self, text: str) -> List[str]:
        """Split the TEXT blob on the 100-asterisk separator, dropping empty parts."""
        split_char = "*" * 100
        return [part.strip() for part in text.split(split_char) if part.strip() != ""]

    def remove_excessive_newlines(self, text: str) -> str:
        """Collapse each triple newline into a single newline.

        NOTE: this is a single pass, so e.g. six consecutive newlines become two
        (kept as-is to preserve the exact prompt text of the benchmark).
        """
        return text.replace("\n\n\n", "\n")
158
+
159
+
160
class N2C2CTMatchingScenario(Scenario):
    """
    From "Cohort selection for clinical trials: n2c2 2018 shared task track 1" (Stubbs et al. 2019).
    N2C2 is a collection of 288 patients (202 train / 86 test), each with 2-5 deidentified real-world clinical notes.
    We use the prompt LLM formulation from Wornow et al. (2024).

    Citation
    ```
    @article{stubbs2019cohort,
    title={Cohort selection for clinical trials: n2c2 2018 shared task track 1},
    author={Stubbs, Amber and Filannino, Michele and Soysal, Ergin and Henry, Samuel and Uzuner, {\"O}zlem},
    journal={Journal of the American Medical Informatics Association},
    volume={26},
    number={11},
    pages={1163--1171},
    year={2019},
    publisher={Oxford University Press}
    }
    @article{wornow2024zero,
    title={Zero-shot clinical trial patient matching with llms},
    author={Wornow, Michael and Lozano, Alejandro and Dash, Dev and Jindal, Jenelle and Mahaffey, Kenneth W and Shah, Nigam H},
    journal={NEJM AI},
    pages={AIcs2400360},
    year={2024},
    publisher={Massachusetts Medical Society}
    }
    ```
    """

    name = "n2c2_ct_matching"
    # Parenthesized concatenation avoids embedding source indentation into the
    # string, which the previous backslash continuation did.
    description = (
        "A dataset that provides clinical notes and asks the model to classify whether the "
        "patient is a valid candidate for a provided clinical trial."
    )
    tags = []  # TODO

    POSSIBLE_ANSWER_CHOICES: List[str] = [
        "yes",
        "no",
    ]

    def __init__(self, subject: str):
        super().__init__()
        self.subject: str = subject  # specific inclusion criterion to assess
        # NOTE(review): cluster-specific absolute paths; the data is not publicly
        # downloadable and must already exist at these locations.
        self.path_to_train_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/train/"
        self.path_to_test_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/test/"

    def create_prompt(self, patient: Dict[str, Any]) -> str:
        """Build the zero-shot yes/no prompt for one patient and `self.subject`."""
        # Cast None values to empty strings during string formatting, but keep the original functions returning None
        notes_list = [
            f"## Note #{i+1}\nDate: {XMLDataLoader.get_date_of_note(patient, i) or ''}\n{note}"
            for i, note in enumerate(patient["ehr"])
        ]
        notes: str = ("\n" + "*" * 50 + "\n\n").join(notes_list)
        current_date = XMLDataLoader.get_current_date_for_patient(patient)
        prompt = f"""
# Task
Your job is to decide whether the given patient meets the inclusion criterion for a clinical trial.

# Inclusion Criterion
The inclusion criterion being assessed is: "{self.subject}".
The definition of the inclusion criterion is: "{LONG_DEFINITIONS[self.subject]}".

# Patient Clinical Notes
Below is a set of {len(patient['ehr'])} clinical notes describing the patient's current health status. \
Each note is separated by a header with the date that the note was written, as well as a long list of asterisks.

{'-' * 100}

{notes}

{'-' * 100}

# Current Date
Assume that the current date is: {current_date}

# Question
Does the patient meet the inclusion criterion "{self.subject}"?
"""
        return prompt

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the n2c2 test split and build one yes/no instance per patient.

        Only the test split is used (zero-shot setting). The previous version
        looped over ["train", "test"] with a guard that skipped the train split,
        leaving an unreachable TRAIN_SPLIT branch; this version processes the
        test split directly with identical output.
        """
        instances: List[Instance] = []
        ensure_directory_exists(self.path_to_test_dir)

        # Load dataset
        dataloader = XMLDataLoader(self.path_to_test_dir)
        dataset = dataloader.load_data()

        # Create instances
        for patient in dataset:
            # Labels are 1/0 ints from XMLDataLoader (is_convert_to_numbers default).
            is_met = bool(patient["labels"][self.subject])
            correct_answer: str = "yes" if is_met else "no"

            # Build `References`. The possible answer choices are "yes" or "no"
            references: List[Reference] = [
                Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
                for answer in N2C2CTMatchingScenario.POSSIBLE_ANSWER_CHOICES
            ]

            instances.append(
                Instance(
                    input=Input(text=self.create_prompt(patient)),
                    references=references,
                    split=TEST_SPLIT,
                )
            )

        return instances
@@ -4,7 +4,7 @@ import csv
4
4
  from typing import List, Dict
5
5
 
6
6
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
7
- from .scenario import (
7
+ from helm.benchmark.scenarios.scenario import (
8
8
  Scenario,
9
9
  Instance,
10
10
  Reference,
@@ -10,7 +10,7 @@ from helm.common.hierarchical_logger import htrack_block, hlog
10
10
  from typing import List, Dict
11
11
 
12
12
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists, asdict_without_nones
13
- from .scenario import (
13
+ from helm.benchmark.scenarios.scenario import (
14
14
  Scenario,
15
15
  Instance,
16
16
  Reference,
@@ -3,7 +3,7 @@ import json
3
3
  import random
4
4
  from typing import Dict, List, Tuple
5
5
 
6
- from .scenario import (
6
+ from helm.benchmark.scenarios.scenario import (
7
7
  Scenario,
8
8
  Instance,
9
9
  Reference,
@@ -15,7 +15,16 @@ from helm.benchmark.window_services.tokenizer_service import TokenizerService
15
15
  from helm.common.authentication import Authentication
16
16
  from helm.common.optional_dependencies import handle_module_not_found_error
17
17
  from helm.proxy.services.server_service import ServerService
18
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
18
+ from helm.benchmark.scenarios.scenario import (
19
+ Scenario,
20
+ Instance,
21
+ Reference,
22
+ TRAIN_SPLIT,
23
+ TEST_SPLIT,
24
+ CORRECT_TAG,
25
+ Input,
26
+ Output,
27
+ )
19
28
 
20
29
  try:
21
30
  import sympy
@@ -0,0 +1,57 @@
1
+ from typing import List, Any
2
+ from pathlib import Path
3
+ from datasets import load_dataset
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ CORRECT_TAG,
10
+ TEST_SPLIT,
11
+ Input,
12
+ Output,
13
+ )
14
+
15
+
16
+ class OABExamsScenario(Scenario):
17
+ """
18
+ The OAB Exam is a mandatory test for anyone who wants to practice law in Brazil. The exam is composed of
19
+ an objective test with 80 multiple-choice questions covering all areas of Law and a written phase focused
20
+ on a specific legal area (e.g., Civil, Criminal, Labor Law), where candidates must draft a legal document
21
+ and answer four essay questions.
22
+
23
+ This dataset is composed of the exams that occurred between 2010 and 2018.
24
+
25
+ The dataset can be found at this link: https://huggingface.co/datasets/eduagarcia/oab_exams
26
+ """
27
+
28
+ name = "oab_exams"
29
+ description = "OAB exams dataset"
30
+ tags = ["knowledge", "multiple_choice", "pt-br"]
31
+
32
+ def get_instances(self, output_path: str) -> List[Instance]:
33
+ # Download the raw data and read all the exam questions
34
+ dataset: Any
35
+ # Read all the instances
36
+ instances: List[Instance] = []
37
+ cache_dir = str(Path(output_path) / "data")
38
+
39
+ dataset = load_dataset("eduagarcia/oab_exams", cache_dir=cache_dir)
40
+ for example in dataset["train"]:
41
+ question = example["question"]
42
+ choices = example["choices"]
43
+ answer = example["answerKey"]
44
+ # Skip every nullified (canceled) question.
45
+ if example["nullified"]:
46
+ continue
47
+ answers_dict = dict(zip(choices["label"], choices["text"]))
48
+ correct_answer = answers_dict[answer]
49
+
50
+ def answer_to_reference(answer: str) -> Reference:
51
+ return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
52
+
53
+ instance = Instance(
54
+ input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
55
+ )
56
+ instances.append(instance)
57
+ return instances