crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -133,7 +133,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
         # Calculate the number of tokens in the example
         total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
-        print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+        # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
         if total_tokens + tokens_to_generate > max_seq_length:
             num_docs -= incremental
             break
@@ -142,7 +142,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
         if num_docs > len(docs):
            num_docs = len(docs)
            break
-    print('Number of documents:', num_docs)
+    # print('Number of documents:', num_docs)
 
     # Generate samples
     for index in tqdm(range(num_samples)):
@@ -72,7 +72,7 @@ Question: {query} Answer:""" # noqa: E501
 
 class RULERHotpotQAScenario(_RULERQAScenario):
     name = "ruler_hotpotqa"
-    description = "The HotpotQA long-context multi-hop RAG question answering scenario from RULER"
+    description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario." # noqa: E501
     tags = ["long_context", "rag"]
 
     def __init__(self, max_num_words: int):
@@ -81,7 +81,7 @@ class RULERHotpotQAScenario(_RULERQAScenario):
 
 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
-    description = "The SQuAD question answering scenario from RULER"
+    description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario." # noqa: E501
     tags = ["long_context", "rag"]
 
     def __init__(self, max_num_words: int):
@@ -1750,7 +1750,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
                 text_noun = self.prompt_components["text_noun"]
                 instruction = self.prompt_components["single_instruction"]
 
-                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
                     question=question.format(row["question_translated"]),
                     text_noun=text_noun,
                     text=row["text"],
@@ -1898,7 +1898,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
                 text_noun = self.prompt_components["text_noun"]
                 instruction = self.prompt_components["single_instruction"]
 
-                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
                     question=question.format(row["question_translated"]),
                     text_noun=text_noun,
                     text=row["text"],
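The two LINDSEA hunks above only double a backslash, which is easy to misread as a no-op. A minimal illustration (not HELM code, placeholder values): in a Python string literal, \{ is an invalid escape sequence, so the interpreter keeps the backslash but emits a warning (DeprecationWarning, SyntaxWarning on newer Pythons), while \\ spells the literal backslash explicitly; the formatted prompt is unchanged.

    # Illustrative sketch only: placeholder values, not HELM data.
    old = "{question}\{text_noun}: {text}\n{instruction}"   # warns: invalid escape sequence '\{'
    new = "{question}\\{text_noun}: {text}\n{instruction}"  # explicit backslash, no warning
    assert old == new  # both literals produce the same string
    print(new.format(question="Q?", text_noun="Passage", text="...", instruction="Answer yes or no."))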
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,19 @@ class SHCBMTMedScenario(Scenario):
 
     name = "shc_bmt_med"
     description = (
-        "A dataset containing patient notes with associated "
-        "questions and answers related to bone marrow transplantation."
+        "BMT-Status is a benchmark composed of clinical notes and associated binary questions"
+        "related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT),"
+        "or hematopoietic cell transplant (HCT) status. The goal is to determine whether the"
+        "patient received a subsequent transplant based on the provided clinical documentation."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -39,7 +46,7 @@ class SHCBMTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -47,10 +54,9 @@ class SHCBMTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-BMT-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCBMTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
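Across these SHC scenarios, the hard-coded /dbfs/... path is replaced by a required data_path constructor argument guarded by check_file_exists. A hedged sketch of what instantiation looks like after this change, assuming the ScenarioSpec pattern HELM run specs use elsewhere; the CSV path below is a placeholder, not a real dataset location:

    # Sketch only: the data_path value is a placeholder.
    from helm.benchmark.scenarios.scenario import ScenarioSpec

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario",
        args={"data_path": "/path/to/medhelm-BMT-dataset_filtered.csv"},  # placeholder path
    )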
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,18 @@ class SHCCDIMedScenario(Scenario):
 
     name = "shc_cdi_med"
     description = (
-        "A dataset built from Clinical Document Integrity (CDI) notes, to assess "
-        "the ability to answer verification questions from previous notes."
+        "CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI)"
+        "notes. It is used to evaluate a model's ability to verify clinical conditions based on"
+        "documented evidence in patient records."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +46,7 @@ class SHCCDIMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    f"Provide an answer to the following question: {question} with the following context: {context} , "
                     "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -48,10 +54,9 @@ class SHCCDIMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CDI-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCCDIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,19 @@ class SHCCONFMedScenario(Scenario):
 
     name = "shc_conf_med"
     description = (
-        "A dataset of clinical notes from adolescent patients used to identify sensitive "
-        "protected health information that should be restricted from parental access."
+        "MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is"
+        "used to evaluate whether the content contains sensitive protected health information"
+        "(PHI) that should be restricted from parental access, in accordance with adolescent"
+        "confidentiality policies in clinical care."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +47,7 @@ class SHCCONFMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -48,10 +55,9 @@ class SHCCONFMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CONF-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCCONFMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,18 @@ class SHCENTMedScenario(Scenario):
 
     name = "shc_ent_med"
     description = (
-        "A dataset designed to evaluate performance in "
-        "identifying appropriate patient referrals to Ear, Nose, and Throat specialists."
+        "ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note"
+        "supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess"
+        "models' abilities to make referral decisions based on unstructured clinical text."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -41,7 +47,7 @@ class SHCENTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"{counter} Provide an answer to the following {question} with the following context:"
+                    f"{counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
                     " Do not provide any additional details or response, just a simple A, B, or C response."
                 )
@@ -50,10 +56,9 @@ class SHCENTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-ENT-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCENTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,11 +23,19 @@ class SHCGIPMedScenario(Scenario):
     """
 
     name = "shc_gip_med"
-    description = "A dataset evaluating performance in identifying appropriate patient referrals to hospice care."
+    description = (
+        "HospiceReferral is a benchmark that evaluates model performance in identifying"
+        "whether patients are eligible for hospice care based on palliative care clinical notes."
+        "The benchmark focuses on end-of-life care referral decisions."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -36,7 +45,7 @@ class SHCGIPMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -44,10 +53,9 @@ class SHCGIPMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-GIP-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCGIPMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -0,0 +1,78 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPRIVACYMedScenario(Scenario):
+    """
+    This dataset features messages sent generated by an LLM from patient clinical notes data.
+    The scenario evaluates the ability of an LLM to determine if any potentially confidential
+    information about the patient was included. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_privacy_med"
+    description = (
+        "PrivacyDetection is a benchmark composed of patient portal messages submitted by"
+        "patients or caregivers. The task is to determine whether the message contains any"
+        "confidential or privacy-leaking information that should be protected."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing clinical messages in order to determine if any confidential "
+                    f"information was included. Please answer the following question: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[SHCPRIVACYMedScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
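Both new scenarios read a flat CSV through csv.DictReader and key on the same three columns seen in the row[...] lookups above: prompt, context, and label (the label being one of the POSSIBLE_ANSWER_CHOICES letters). An illustrative, made-up row to show the expected shape only:

    # Illustrative sketch: column names come from the code above; the values are invented.
    import csv
    import io

    sample = io.StringIO()
    writer = csv.DictWriter(sample, fieldnames=["prompt", "context", "label"])
    writer.writeheader()
    writer.writerow(
        {
            "prompt": "Does this message contain confidential information?",
            "context": "Example portal message text.",
            "label": "B",
        }
    )
    print(sample.getvalue())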
@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPROXYMedScenario(Scenario):
+    """
+    This dataset features messages sent by proxy users and non proxy users, for evaluation of
+    LLM capabilities to determine the sender. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_proxy_med"
+    description = (
+        "ProxySender is a benchmark composed of patient portal messages received by clinicians."
+        "It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent,"
+        "spouse), which is critical for understanding who is communicating with healthcare"
+        "providers."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical messages in order to determine if they have been "
+                    f"sent by a proxy user. Please determine the following: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCPROXYMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -27,14 +28,19 @@ class SHCPTBMMedScenario(Scenario):
 
     name = "shc_ptbm_med"
     description = (
-        "A dataset that classifies whether a clinical note contains a clinician "
-        "recommendation for parent training in behavior management, which is the first-line "
-        "evidence-based treatment for young children with ADHD."
+        "ADHD-Behavior is a benchmark that evaluates a model’s ability to detect whether"
+        "a clinician recommends parent training in behavior management, an evidence-based"
+        "first-line treatment for young children diagnosed with ADHD. Each instance includes"
+        "a clinical note from a pediatric visit and a binary classification task."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -46,7 +52,7 @@ class SHCPTBMMedScenario(Scenario):
                 prompt = (
                     "You are reviewing a clinical note from health records of children with "
                     "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
-                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"behavioral therapy. Provide an answer to the following question: {question} with the "
                     f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
                     "for no. Do not provide any additional details or response, just a simple A or B response."
                 )
@@ -54,10 +60,9 @@ class SHCPTBMMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-PTBM-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCPTBMMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -30,14 +31,19 @@ class SHCSEIMedScenario(Scenario):
 
     name = "shc_sei_med"
     description = (
-        "A dataset that classifies whether a clinical note contains documentation "
-        "of side effect monitoring (recording of absence or presence of medication "
-        "side effects), as recommended in clinical practice guidelines."
+        "ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for"
+        "pediatric ADHD visits document medication side effect monitoring, which is a key recommendation"
+        "in clinical practice guidelines. The dataset supports binary classification"
+        "to detect presence or absence of side effect inquiries (SEI) within notes."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -59,7 +65,7 @@ class SHCSEIMedScenario(Scenario):
                     "categorized as SEI because they consist of a plan or an explanation about "
                     "side effects without actual side effect monitoring taking place, and "
                     "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -67,10 +73,9 @@ class SHCSEIMedScenario(Scenario):
         return data
 
    def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-SEI-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCSEIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,12 +23,18 @@ class SHCSequoiaMedScenario(Scenario):
 
     name = "shc_sequoia_med"
     description = (
-        "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic."
+        "ClinicReferral is a benchmark that determines patient eligibility for referral to the"
+        "Sequoia Clinic based on information from palliative care notes. The dataset provides"
+        "curated decisions on referral appropriateness to assist in automating clinic workflows."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -38,7 +45,7 @@ class SHCSequoiaMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -47,10 +54,11 @@ class SHCSequoiaMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-sequoia-dataset_filtered.csv"
-
+        check_file_exists(
+            self.data_path, msg=f"[SHCSequoiaMedScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES