crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic.

Files changed (206):
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/scenarios/shc_bmt_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,19 @@ class SHCBMTMedScenario(Scenario):
 
     name = "shc_bmt_med"
     description = (
-        "A dataset containing patient notes with associated "
-        "questions and answers related to bone marrow transplantation."
+        "BMT-Status is a benchmark composed of clinical notes and associated binary questions"
+        "related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT),"
+        "or hematopoietic cell transplant (HCT) status. The goal is to determine whether the"
+        "patient received a subsequent transplant based on the provided clinical documentation."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -39,7 +46,7 @@ class SHCBMTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -47,10 +54,9 @@ class SHCBMTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-BMT-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCBMTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
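
The check_file_exists helper imported above is new in helm.common.general; its implementation is not part of this diff, but the call sites show it validating a caller-supplied path before the scenario reads it. A hypothetical stand-in, inferred from how it is called rather than taken from the package, might look like:

    import os

    def check_file_exists(file_path: str, msg: str = "") -> None:
        # Hypothetical sketch: fail fast with a descriptive error when the
        # user-provided data file is missing. The real helper in
        # helm.common.general may differ in its details.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(msg or f"Required file not found: '{file_path}'")
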

helm/benchmark/scenarios/shc_cdi_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,18 @@ class SHCCDIMedScenario(Scenario):
 
     name = "shc_cdi_med"
     description = (
-        "A dataset built from Clinical Document Integrity (CDI) notes, to assess "
-        "the ability to answer verification questions from previous notes."
+        "CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI)"
+        "notes. It is used to evaluate a model's ability to verify clinical conditions based on"
+        "documented evidence in patient records."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +46,7 @@ class SHCCDIMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    f"Provide an answer to the following question: {question} with the following context: {context} , "
                     "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -48,10 +54,9 @@ class SHCCDIMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CDI-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCCDIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES

helm/benchmark/scenarios/shc_conf_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -24,13 +25,19 @@ class SHCCONFMedScenario(Scenario):
 
     name = "shc_conf_med"
     description = (
-        "A dataset of clinical notes from adolescent patients used to identify sensitive "
-        "protected health information that should be restricted from parental access."
+        "MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is"
+        "used to evaluate whether the content contains sensitive protected health information"
+        "(PHI) that should be restricted from parental access, in accordance with adolescent"
+        "confidentiality policies in clinical care."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -40,7 +47,7 @@ class SHCCONFMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -48,10 +55,9 @@ class SHCCONFMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CONF-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCCONFMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES

helm/benchmark/scenarios/shc_ent_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -23,13 +24,18 @@ class SHCENTMedScenario(Scenario):
 
     name = "shc_ent_med"
     description = (
-        "A dataset designed to evaluate performance in "
-        "identifying appropriate patient referrals to Ear, Nose, and Throat specialists."
+        "ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note"
+        "supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess"
+        "models' abilities to make referral decisions based on unstructured clinical text."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -41,7 +47,7 @@ class SHCENTMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"{counter} Provide an answer to the following {question} with the following context:"
+                    f"{counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
                     " Do not provide any additional details or response, just a simple A, B, or C response."
                 )
@@ -50,10 +56,9 @@ class SHCENTMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-ENT-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCENTMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES

helm/benchmark/scenarios/shc_gip_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,11 +23,19 @@ class SHCGIPMedScenario(Scenario):
     """
 
     name = "shc_gip_med"
-    description = "A dataset evaluating performance in identifying appropriate patient referrals to hospice care."
+    description = (
+        "HospiceReferral is a benchmark that evaluates model performance in identifying"
+        "whether patients are eligible for hospice care based on palliative care clinical notes."
+        "The benchmark focuses on end-of-life care referral decisions."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -36,7 +45,7 @@ class SHCGIPMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -44,10 +53,9 @@ class SHCGIPMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-GIP-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCGIPMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES

helm/benchmark/scenarios/shc_privacy_scenario.py (new file)

@@ -0,0 +1,78 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPRIVACYMedScenario(Scenario):
+    """
+    This dataset features messages sent generated by an LLM from patient clinical notes data.
+    The scenario evaluates the ability of an LLM to determine if any potentially confidential
+    information about the patient was included. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_privacy_med"
+    description = (
+        "PrivacyDetection is a benchmark composed of patient portal messages submitted by"
+        "patients or caregivers. The task is to determine whether the message contains any"
+        "confidential or privacy-leaking information that should be protected."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing clinical messages in order to determine if any confidential "
+                    f"information was included. Please answer the following question: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[SHCPRIVACYMedScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
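
Because the new scenario takes its CSV location through the constructor instead of a hard-coded cluster path, it can be exercised locally. A minimal smoke test, assuming a small CSV with the prompt, context, and label columns that create_benchmark reads (the file name below is made up for illustration):

    import csv

    from helm.benchmark.scenarios.shc_privacy_scenario import SHCPRIVACYMedScenario

    # Write a tiny stand-in dataset with the columns the scenario expects.
    with open("privacy_sample.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["prompt", "context", "label"])
        writer.writeheader()
        writer.writerow(
            {
                "prompt": "Does this message contain confidential information?",
                "context": "Patient asks when their prescription refill will be ready.",
                "label": "B",
            }
        )

    # Point the scenario at the local file and build instances.
    scenario = SHCPRIVACYMedScenario(data_path="privacy_sample.csv")
    instances = scenario.get_instances(output_path=".")
    print(len(instances), instances[0].split)
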

helm/benchmark/scenarios/shc_proxy_scenario.py (new file)

@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPROXYMedScenario(Scenario):
+    """
+    This dataset features messages sent by proxy users and non proxy users, for evaluation of
+    LLM capabilities to determine the sender. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_proxy_med"
+    description = (
+        "ProxySender is a benchmark composed of patient portal messages received by clinicians."
+        "It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent,"
+        "spouse), which is critical for understanding who is communicating with healthcare"
+        "providers."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical messages in order to determine if they have been "
+                    f"sent by a proxy user. Please determine the following: {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCPROXYMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_ptbm_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -27,14 +28,19 @@ class SHCPTBMMedScenario(Scenario):
 
     name = "shc_ptbm_med"
     description = (
-        "A dataset that classifies whether a clinical note contains a clinician "
-        "recommendation for parent training in behavior management, which is the first-line "
-        "evidence-based treatment for young children with ADHD."
+        "ADHD-Behavior is a benchmark that evaluates a model’s ability to detect whether"
+        "a clinician recommends parent training in behavior management, an evidence-based"
+        "first-line treatment for young children diagnosed with ADHD. Each instance includes"
+        "a clinical note from a pediatric visit and a binary classification task."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -46,7 +52,7 @@ class SHCPTBMMedScenario(Scenario):
                 prompt = (
                     "You are reviewing a clinical note from health records of children with "
                     "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
-                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"behavioral therapy. Provide an answer to the following question: {question} with the "
                     f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
                     "for no. Do not provide any additional details or response, just a simple A or B response."
                 )
@@ -54,10 +60,9 @@ class SHCPTBMMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-PTBM-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCPTBMMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES

helm/benchmark/scenarios/shc_sei_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -30,14 +31,19 @@ class SHCSEIMedScenario(Scenario):
 
     name = "shc_sei_med"
     description = (
-        "A dataset that classifies whether a clinical note contains documentation "
-        "of side effect monitoring (recording of absence or presence of medication "
-        "side effects), as recommended in clinical practice guidelines."
+        "ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for"
+        "pediatric ADHD visits document medication side effect monitoring, which is a key recommendation"
+        "in clinical practice guidelines. The dataset supports binary classification"
+        "to detect presence or absence of side effect inquiries (SEI) within notes."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         with open(csv_path, "r") as file:
@@ -59,7 +65,7 @@ class SHCSEIMedScenario(Scenario):
                     "categorized as SEI because they consist of a plan or an explanation about "
                     "side effects without actual side effect monitoring taking place, and "
                     "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
-                    f"Provide an answer to the following {question} with the following context: {context} "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
                     ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
                     "details or response, just a simple A or B response."
                 )
@@ -67,10 +73,9 @@ class SHCSEIMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-SEI-dataset_filtered.csv"
-
+        check_file_exists(self.data_path, msg=f"[SHCSEIMedScenario] Required data file not found: '{self.data_path}'")
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES

helm/benchmark/scenarios/shc_sequoia_scenario.py

@@ -11,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import check_file_exists
 
 csv.field_size_limit(sys.maxsize)
 
@@ -22,12 +23,18 @@ class SHCSequoiaMedScenario(Scenario):
 
     name = "shc_sequoia_med"
     description = (
-        "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic."
+        "ClinicReferral is a benchmark that determines patient eligibility for referral to the"
+        "Sequoia Clinic based on information from palliative care notes. The dataset provides"
+        "curated decisions on referral appropriateness to assist in automating clinic workflows."
    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
         counter = 1
@@ -38,7 +45,7 @@ class SHCSequoiaMedScenario(Scenario):
                 context = row["context"]
                 answer = row["label"]
                 prompt = (
-                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {counter} Provide an answer to the following question: {question} with the following context:"
                     f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
                     "additional details or response, just a simple A or B response."
                 )
@@ -47,10 +54,11 @@ class SHCSequoiaMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-sequoia-dataset_filtered.csv"
-
+        check_file_exists(
+            self.data_path, msg=f"[SHCSequoiaMedScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
-        benchmark_data = self.create_benchmark(data_path)
+        benchmark_data = self.create_benchmark(self.data_path)
 
         for prompt, answer in benchmark_data.items():
             assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES

helm/benchmark/scenarios/starr_patient_instructions_scenario.py

@@ -1,8 +1,7 @@
-import os
 import csv
 from typing import List
 
-from helm.common.general import ensure_directory_exists
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -40,19 +39,27 @@ class StarrPatientInstructionsScenario(Scenario):
     """
 
     name = "starr_patient_instructions"
-    description = "A dataset containing case details used to generate customized post-procedure patient instructions."
+    description = (
+        "PatientInstruct is a benchmark designed to evaluate models on generating personalized"
+        "post-procedure instructions for patients. It includes real-world patient History & Physical"
+        "Note (H&P) and operative report, from which models must produce clear, actionable instructions"
+        "appropriate for patients recovering from medical interventions."
+    )
     tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
 
-    def get_instances(self, output_path: str) -> List[Instance]:
-        csv_path = "/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv"
-        # Ensure the directory for the CSV file exists.
-        ensure_directory_exists(os.path.dirname(csv_path))
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
 
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[StarrPatientInstructiosScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
         # For now, we assign all instances to the test split (zero-shot setting).
         split = TEST_SPLIT
 
-        with open(csv_path, "r", encoding="utf-8") as csvfile:
+        with open(self.data_path, "r", encoding="utf-8") as csvfile:
             reader = csv.DictReader(csvfile)
             for row in reader:
                 # Retrieve and strip the relevant fields.
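
The same constructor change recurs across all of the scenarios above: hard-coded cluster paths ("/dbfs/...", "/share/pi/...") are removed, and each scenario now receives data_path from whoever builds it. In HELM, scenarios are normally constructed from a ScenarioSpec, so a run spec would supply the path explicitly; a sketch, where the CSV location is a placeholder for wherever the access-restricted data lives locally:

    from helm.benchmark.scenarios.scenario import ScenarioSpec

    # Sketch of how a run spec could point the refactored scenario at a local copy
    # of the (access-restricted) dataset; the path below is a placeholder.
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario",
        args={"data_path": "/path/to/medhelm-BMT-dataset_filtered.csv"},
    )
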

helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py (new file)

@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.infinite_bench_en_qa_scenario import InfiniteBenchEnQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+@pytest.mark.scenarios
+def test_infinite_bench_en_qa_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = InfiniteBenchEnQAScenario(max_num_words=10000000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 351
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 381829
+        assert len(instances[0].references) == 1
+        assert len(instances[0].references[0].output.text) == 8
+        assert instances[0].references[0].tags == [CORRECT_TAG]
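
The new test doubles as a usage example for the InfiniteBench En.QA scenario added in this release; the same call sequence works for a quick manual check outside pytest (a sketch, assuming the scenario fetches its data into the supplied output directory, as the test's use of a temporary directory suggests):

    from tempfile import TemporaryDirectory

    from helm.benchmark.scenarios.infinite_bench_en_qa_scenario import InfiniteBenchEnQAScenario

    # Materialize the En.QA instances into a scratch directory and inspect the first one.
    with TemporaryDirectory() as tmpdir:
        scenario = InfiniteBenchEnQAScenario(max_num_words=10_000_000)
        instances = scenario.get_instances(tmpdir)
        print(len(instances), instances[0].split, len(instances[0].input.text))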