crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registries, and is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -0,0 +1,152 @@
+ from typing import Any, Dict, List, Optional
+
+ from datasets import load_dataset, Dataset
+ from helm.common.hierarchical_logger import htrack_block
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TRAIN_SPLIT,
+     TEST_SPLIT,
+     VALID_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ class MELTTranslationScenario(Scenario):
+     name = "melt_translation"
+     description = "Machine Translation scenario."
+     tags = ["machine_translation"]
+
+     def __init__(
+         self,
+         dataset_name: str,
+         revision: str,
+         source_language: str,
+         target_language: str,
+         subset: Optional[str] = None,
+         splits: Optional[Dict[str, str]] = None,
+     ):
+         """Initializes the question answering scenario.
+
+         Args:
+             dataset_name: The name of the dataset.
+             revision: The revision of the dataset to use.
+             source_language: The source language to use.
+             target_language: The target language to use.
+             subset: The subset of the dataset to use. Defaults to "".
+             splits: The splits to use for the dataset. Defaults to None.
+         """
+         super().__init__()
+         self.MAX_TRAIN_INSTANCES = 20_000
+         valid_languages = set(["vi", "en"])
+         self.dataset_name = dataset_name
+         self.subset = subset
+         self.revision = revision
+         self.splits = splits
+         self.source_language = source_language
+         self.target_language = target_language
+         if self.source_language not in valid_languages or self.target_language not in valid_languages:
+             raise ValueError("Supported languages: vi, en.")
+         if self.source_language == self.target_language:
+             raise ValueError("The source language and the target language should be different.")
+         if self.source_language != "en" and self.target_language != "en":
+             raise ValueError("One of the languages should be English.")
+
+     def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+         """
+         Helper for generating instances for a split.
+         Args:
+             splits (dict): Which splits to partition the data into.
+         Returns:
+             List[Instance]: Instances from the file for the specified split.
+         """
+         with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
+             hf_dataset: Any = load_dataset(
+                 self.dataset_name,
+                 self.subset,
+                 revision=self.revision,
+                 trust_remote_code=True,
+             )
+
+         instances: List[Instance] = []
+
+         for dataset_split_name, helm_split_name in splits.items():
+             if helm_split_name == TRAIN_SPLIT:
+                 hf_dataset[dataset_split_name] = hf_dataset[dataset_split_name].shuffle(seed=42)[
+                     : self.MAX_TRAIN_INSTANCES
+                 ]
+                 hf_dataset[dataset_split_name] = Dataset.from_dict(hf_dataset[dataset_split_name])
+
+             for example in hf_dataset[dataset_split_name]:
+                 source_sentence = example[self.source_language]
+                 target_sentence = example[self.target_language]
+                 instances.append(
+                     Instance(
+                         input=Input(text=source_sentence),
+                         references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                         split=helm_split_name,
+                     )
+                 )
+         return instances
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         if self.splits is None:
+             splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+         else:
+             splits = {}
+             if "train" in self.splits:
+                 splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+             if "validation" in self.splits:
+                 splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+             if "test" in self.splits:
+                 splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+         instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+         return instances
+
+
+ class MELTTranslationOPUS100Scenario(MELTTranslationScenario):
+     """
+     Scenario for the OPUS100 dataset.
+     """
+
+     name = "melt_translation_opus100"
+     description = "OPUS100 dataset for machine translation."
+     tags = ["machine_translation"]
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             dataset_name="vietgpt/opus100_envi",
+             revision="45df06fb0b31edc882d7c8d34389261f995e5208",
+             splits={
+                 TRAIN_SPLIT: "train",
+                 VALID_SPLIT: "validation",
+                 TEST_SPLIT: "test",
+             },
+             **kwargs,
+         )
+
+
+ class MELTTranslationPhoMTScenario(MELTTranslationScenario):
+     """
+     Scenario for the PhoMT dataset.
+     """
+
+     name = "melt_translation_phomt"
+     description = "PhoMT dataset for machine translation."
+     tags = ["machine_translation"]
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             dataset_name="ura-hcmut/PhoMT",
+             revision="74386685db01dc038860ff0a90d9f5fbde284bf7",
+             splits={
+                 TRAIN_SPLIT: "train",
+                 VALID_SPLIT: "validation",
+                 TEST_SPLIT: "test",
+             },
+             **kwargs,
+         )
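For orientation, a minimal usage sketch of the new MELT translation scenarios, based only on the constructor and get_instances signature visible in the hunk above; the argument values are illustrative assumptions, not part of the release:

    # Hypothetical sketch: build the OPUS100 English-to-Vietnamese scenario and
    # materialize HELM instances (requires the Hugging Face "datasets" package).
    scenario = MELTTranslationOPUS100Scenario(source_language="en", target_language="vi")
    instances = scenario.get_instances(output_path="./melt_translation")
    # Each Instance pairs an English source sentence with its Vietnamese reference,
    # tagged CORRECT_TAG and assigned to the train/validation/test split mapped above.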
@@ -9,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
      PassageQuestionInput,
      Output,
  )
+ from helm.common.general import check_file_exists


  class MentalHealthScenario(Scenario):
@@ -48,10 +49,19 @@ class MentalHealthScenario(Scenario):
      """

      name = "mental_health"
-     description = "A dataset containing a counselor and mental health patient conversation, where the objective is to \
-         generate an empathetic counselor response."
+     description = (
+         "MentalHealth is a benchmark focused on evaluating empathetic communication in"
+         "mental health counseling. It includes simulated conversations between patients"
+         "and counselors, where the task is to generate compassionate and appropriate counselor"
+         "responses. The benchmark assesses a model's ability to support patients emotionally"
+         "and meaningfully engage in therapeutic conversations."
+     )
      tags = ["dialogue", "counseling", "mental_health", "empathy", "healthcare"]

+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
      def process_dialogue_data(self, data: pd.DataFrame) -> List[Instance]:
          """
          Process the dialogue data into evaluation instances.
@@ -102,9 +112,10 @@ class MentalHealthScenario(Scenario):
          Returns:
              List[Instance]: List of processed instances for evaluation
          """
-         # Load the processed dialogue data
-         data_path = "/share/pi/nigam/data/medhelm/mental_health/processed_dialogues.csv"
-         dialogue_data = pd.read_csv(data_path)
+         check_file_exists(
+             self.data_path, msg=f"[MentalHealthScenario] Required data file not found: '{self.data_path}'"
+         )
+         dialogue_data = pd.read_csv(self.data_path)

          # Process into instances
          instances = self.process_dialogue_data(dialogue_data)
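This release replaces several hard-coded cluster paths with a required data_path constructor argument; the same pattern recurs in the MIMIC-BHC, MIMIC-RRS, MIMIC-IV billing code, and N2C2 hunks below. A hedged sketch of what this means for a caller, with an illustrative path:

    # Hypothetical sketch: callers now pass the data location explicitly instead of
    # relying on the previously hard-coded /share/pi/nigam/... path.
    scenario = MentalHealthScenario(data_path="/data/mental_health/processed_dialogues.csv")
    instances = scenario.get_instances(output_path="./mental_health")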
@@ -1,7 +1,7 @@
  import json
  from typing import Dict, List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -59,15 +59,20 @@ class MIMICBHCScenario(Scenario):

      name = "mimic_bhc"
      description = (
-         "A summarization task using a curated collection of preprocessed discharge notes"
-         " paired with their corresponding brief hospital course (BHC) summaries."
+         "MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief"
+         "Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV,"
+         "each paired with its corresponding BHC summary. The benchmark evaluates a model's"
+         "ability to condense detailed clinical information into accurate, concise summaries that"
+         "reflect the patient's hospital stay."
      )
      tags = ["summarization", "biomedical"]

+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
      def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/bhc-mimiciv/"
-         ensure_directory_exists(data_path)
-         data_path = data_path + "mimic_iv_bhc.json"
+         check_file_exists(self.data_path, msg=f"[MIMICBHCScenario] Required data file not found: '{self.data_path}'")

          instances: List[Instance] = []
          # Limit to zero shot setting for now
@@ -77,7 +82,7 @@ class MIMICBHCScenario(Scenario):
              "test": TEST_SPLIT,
          }

-         with open(data_path, "r") as f:
+         with open(self.data_path, "r") as f:
              data = [json.loads(line) for line in f]

          for data_split, split in splits.items():
@@ -1,7 +1,7 @@
  import os
  from typing import Dict, List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -44,15 +44,18 @@ class MIMICRRSScenario(Scenario):

      name = "mimic_rrs"
      description = (
-         "A dataset containing radiology reports with findings sections from MIMIC-III paired with"
-         " their corresponding impression sections, used for generating radiology report summaries."
+         "MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III"
+         "database. It contains pairs of 'Findings' and 'Impression' sections, enabling evaluation"
+         "of a model's ability to summarize diagnostic imaging observations into concise, clinically"
+         "relevant conclusions."
      )
      tags = ["question_answering", "biomedical"]

-     def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/rrs-mimiciii/all"
-         ensure_directory_exists(data_path)
+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path

+     def get_instances(self, output_path: str) -> List[Instance]:
          instances: List[Instance] = []
          # Limit to zero shot setting for now
          splits: Dict[str, str] = {
@@ -64,8 +67,14 @@ class MIMICRRSScenario(Scenario):
          for data_split, split in splits.items():
              split_findings_name: str = f"{data_split}.findings.tok"
              split_impressions_name: str = f"{data_split}.impression.tok"
-             findings_path: str = os.path.join(data_path, split_findings_name)
-             impressions_path: str = os.path.join(data_path, split_impressions_name)
+             findings_path: str = os.path.join(self.data_path, split_findings_name)
+             impressions_path: str = os.path.join(self.data_path, split_impressions_name)
+             check_file_exists(
+                 findings_path, msg=f"[MIMICRRSScenario] Required findings file not found: '{findings_path}'"
+             )
+             check_file_exists(
+                 impressions_path, msg=f"[MIMICRRSScenario] Required impressions file not found: '{impressions_path}'"
+             )
              findings: List[str] = self.read_file(findings_path)
              impressions: List[str] = self.read_file(impressions_path)
              assert len(findings) == len(impressions), "Findings and impressions must have the same length"
@@ -1,9 +1,8 @@
- import os
  import pandas as pd
  import numpy as np
  from typing import List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -24,21 +23,28 @@ class MIMICIVBillingCodeScenario(Scenario):
      """

      name = "mimiciv_billing_code"
-     description = "A dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes."
+     description = (
+         "MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the"
+         "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task"
+         "requires models to extract structured billing codes based on free-text clinical notes,"
+         "reflecting real-world hospital coding tasks for financial reimbursement."
+     )
      tags = ["question_answering", "biomedical"]

-     def __init__(self, data_file: str):
+     def __init__(self, data_path: str):
          """
-         :param data_file: Path to the mimiciv_icd10.feather file.
+         :param data_path: Path to the mimiciv_icd10.feather file.
          """
          super().__init__()
-         self.data_file = data_file
+         self.data_path = data_path

      def get_instances(self, output_path: str) -> List[Instance]:
-         ensure_directory_exists(os.path.dirname(self.data_file))
+         check_file_exists(
+             self.data_path, msg=f"[MIMICIVBilligCodeScenario] Required data file not found: '{self.data_path}'"
+         )

          # Read the preprocessed MIMIC-IV data (.feather format)
-         df = pd.read_feather(self.data_file)  # columns: ["text", "target", ...]
+         df = pd.read_feather(self.data_path)  # columns: ["text", "target", ...]

          instances: List[Instance] = []

@@ -40,7 +40,7 @@ class MMLUProScenario(Scenario):

      def __init__(self, subject: str):
          super().__init__()
-         self.subject: str = subject
+         self.subject: str = subject.replace("_", " ")

      def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
          """
@@ -31,9 +31,12 @@ class MTSamplesProceduresScenario(Scenario):
      GITHUB_DIR_URL = f"https://github.com/raulista1997/benchmarkdata/tree/{GIT_HASH}/mtsample_procedure"
      RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsample_procedure/"

-     name = "mtsamples"
+     name = "mtsamples_procedures"
      description = (
-         "A dataset that provides a patient note regarding an operation, with the objective to document the procedure."
+         "MTSamples Procedures is a benchmark composed of transcribed operative notes,"
+         "focused on documenting surgical procedures. Each example presents a brief patient case"
+         "involving a surgical intervention, and the model is tasked with generating a coherent"
+         "and clinically accurate procedural summary or treatment plan."
      )
      tags = ["medical", "transcription", "plan_generation"]

@@ -36,8 +36,9 @@ class MTSamplesReplicateScenario(Scenario):

      name = "mtsamples_replicate"
      description = (
-         "A dataset of clinical notes where the model is prompted to generate "
-         "a reasonable treatment plan for the patient based on transcribed medical reports."
+         "MTSamples Replicate is a benchmark that provides transcribed medical reports"
+         "from various specialties. It is used to evaluate a model's ability to generate clinically"
+         "appropriate treatment plans based on unstructured patient documentation"
      )
      tags = ["medical", "transcription", "plan_generation"]

@@ -188,8 +188,13 @@ class N2C2CTMatchingScenario(Scenario):
      """

      name = "n2c2_ct_matching"
-     description = "A dataset that provides clinical notes and asks the model to classify whether the \
-         patient is a valid candidate for a provided clinical trial."
+     description = (
+         "N2C2-CT is a benchmark designed to evaluate a model's ability to match patients to"
+         "appropriate clinical trials based on eligibility criteria. Each example includes a clinical"
+         "note and a trial description. The model is tasked with determining whether the patient"
+         "is a valid candidate for the trial. This benchmark supports automation and decision"
+         "support in clinical research enrollment."
+     )
      tags = []  # TODO

      POSSIBLE_ANSWER_CHOICES: List[str] = [
@@ -197,11 +202,12 @@ class N2C2CTMatchingScenario(Scenario):
          "no",
      ]

-     def __init__(self, subject: str):
+     def __init__(self, data_path: str, subject: str):
          super().__init__()
          self.subject: str = subject  # specific inclusion criterion to assess
-         self.path_to_train_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/train/"
-         self.path_to_test_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/test/"
+         self.data_path: str = data_path
+         self.path_to_train_dir: str = os.path.join(self.data_path, "train/")
+         self.path_to_test_dir: str = os.path.join(self.data_path, "test/")

      def create_prompt(self, patient: Dict[str, Any]) -> str:
          # Cast None values to empty strings during string formatting, but keep the original functions returning None
@@ -11,6 +11,7 @@ from typing import List, Optional, Tuple, Dict

  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.common.local_context import LocalContext
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
  from helm.common.authentication import Authentication
  from helm.common.optional_dependencies import handle_module_not_found_error
@@ -39,7 +40,7 @@ except ModuleNotFoundError as e:
  # https://github.com/stanford-crfm/benchmarking/issues/569
  def get_test_tokenizer_service() -> TokenizerService:
      # Pointed to the default local path set in run.py (--local-path)
-     return TokenizerService(ServerService(base_path="prod_env", root_mode=True), Authentication("test"))
+     return TokenizerService(LocalContext(base_path="prod_env"))


  SOLUTION_TAG: str = "solution"
@@ -0,0 +1,79 @@
+ import json
+ import os
+ import re
+ from typing import List, Optional
+
+ import datasets
+ import tiktoken
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     Output,
+     Reference,
+     Scenario,
+     Instance,
+     TEST_SPLIT,
+     Input,
+ )
+ from helm.common.general import ensure_directory_exists
+
+
+ class OpenAIMRCRScenario(Scenario):
+     """OpenAI MRCR scenario
+
+     OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking
+     an LLM's ability to distinguish between multiple needles hidden in context. This eval is
+     inspired by the MRCR eval first introduced by Gemini (https://arxiv.org/pdf/2409.12640v2).
+
+     The task is as follows: The model is given a long, multi-turn, synthetically generated
+     conversation between user and model where the user asks for a piece of writing about a topic,
+     e.g. "write a poem about tapirs" or "write a blog post about rocks". Hidden in this conversation
+     are 2, 4, or 8 identical asks, and the model is ultimately prompted to return the i-th instance
+     of one of those asks. For example, "Return the 2nd poem about tapirs".
+
+     Reference: https://huggingface.co/datasets/openai/mrcr"""
+
+     name = "openai_mrcr"
+     description = "OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2)."  # noqa: E501
+     tags = ["long_context", "mrcr"]
+
+     NEEDLES_OPTIONS = [2, 4, 8]
+
+     def __init__(self, needles: int, max_num_words: Optional[int] = None):
+         super().__init__()
+         self.needles = needles
+         self.max_num_words = max_num_words
+         if needles not in self.NEEDLES_OPTIONS:
+             raise Exception(f"Needles must be one of {self.NEEDLES_OPTIONS}")
+         self.tokenizer = tiktoken.get_encoding("o200k_base")
+
+     def count_words(self, messages: list[dict]) -> int:
+         return sum([len(re.split(r"\s+", m["content"].strip())) for m in messages])
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset = datasets.load_dataset(
+             "openai/mrcr",
+             cache_dir=cache_dir,
+             split="train",
+             data_files=[f"{self.needles}needle.parquet"],
+             revision="204b0d4e8d9ca5c0a90bf942fdb2a5969094adc0",
+         )
+         instances = []
+         for idx, row in enumerate(dataset):
+             messages = json.loads(row["prompt"])
+             if self.max_num_words and self.count_words(messages) > self.max_num_words:
+                 continue
+             input = Input(messages=messages)
+             references = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+             instance = Instance(
+                 id=f"{self.needles}needle{idx}",
+                 input=input,
+                 references=references,
+                 split=TEST_SPLIT,
+                 extra_data={"random_string_to_prepend": row["random_string_to_prepend"]},
+             )
+             instances.append(instance)
+
+         return instances
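A minimal sketch of how this new scenario might be driven, based only on the constructor and word filter shown above; the argument values are illustrative assumptions:

    # Hypothetical sketch: load the 2-needle split and skip conversations longer
    # than roughly 128k whitespace-delimited words (see count_words above).
    scenario = OpenAIMRCRScenario(needles=2, max_num_words=128_000)
    instances = scenario.get_instances(output_path="./openai_mrcr")
    # Each instance carries the multi-turn conversation as messages, the expected
    # answer as its reference, and random_string_to_prepend in extra_data.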
@@ -125,7 +125,12 @@ class PubMedQAScenario(Scenario):
      """

      name = "pubmed_qa"
-     description = "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions."
+     description = (
+         "PubMedQA is a biomedical question-answering dataset that evaluates a model's"
+         "ability to interpret scientific literature. It consists of PubMed abstracts paired with"
+         "yes/no/maybe questions derived from the content. The benchmark assesses a model's"
+         "capability to reason over biomedical texts and provide factually grounded answers."
+     )
      tags = ["question_answering", "biomedical"]

      POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]
@@ -1,6 +1,6 @@
  import csv
+ import os

- from filelock import FileLock
  from typing import Dict, List
  from docx import Document

@@ -13,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
      Reference,
      Output,
  )
+ from helm.common.general import ensure_file_downloaded


  def extract_red_text_runs(document):
@@ -87,10 +88,19 @@ class RaceBasedMedScenario(Scenario):
      """

      name = "race_based_med"
-     description = "A collection of LLM outputs in response to medical questions with race-based biases, \
-         with the objective being to classify whether the output contains racially biased content."
+     description = (
+         "RaceBias is a benchmark used to evaluate language models for racially biased or"
+         "inappropriate content in medical question-answering scenarios. Each instance consists"
+         "of a medical question and a model-generated response. The task is to classify whether"
+         "the response contains race-based, harmful, or inaccurate content. This benchmark"
+         "supports research into bias detection and fairness in clinical AI systems."
+     )
      tags = ["knowledge", "reasoning", "biomedical"]
      POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+     FILE_URL: str = (
+         "https://static-content.springer.com/esm/"
+         "art%3A10.1038%2Fs41746-023-00939-z/MediaObjects/41746_2023_939_MOESM1_ESM.docx"
+     )

      def create_benchmark(self, csv_path) -> Dict[str, str]:
          data = {}
@@ -114,12 +124,12 @@ class RaceBasedMedScenario(Scenario):
          return data

      def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/medhelm/race_based/race_based.csv"
          # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
-         word_file = "/share/pi/nigam/data/medhelm/race_based/race_based.docx"
-         lock_path = data_path + ".lock"
-         with FileLock(lock_path):
-             # if not os.path.exists(data_path):
+         data_path = os.path.join(output_path, "race_based.csv")
+
+         if not os.path.exists(data_path):
+             word_file = os.path.join(output_path, "race_based.docx")
+             ensure_file_downloaded(source_url=self.FILE_URL, target_path=word_file, unpack=False)
              create_csv_from_word(word_file, data_path)

          instances: List[Instance] = []
@@ -133,7 +133,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
          input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
          # Calculate the number of tokens in the example
          total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
-         print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+         # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
          if total_tokens + tokens_to_generate > max_seq_length:
              num_docs -= incremental
              break
@@ -142,7 +142,7 @@ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed
          if num_docs > len(docs):
              num_docs = len(docs)
              break
-     print('Number of documents:', num_docs)
+     # print('Number of documents:', num_docs)

      # Generate samples
      for index in tqdm(range(num_samples)):
@@ -72,7 +72,7 @@ Question: {query} Answer:""" # noqa: E501

  class RULERHotpotQAScenario(_RULERQAScenario):
      name = "ruler_hotpotqa"
-     description = "The HotpotQA long-context multi-hop RAG question answering scenario from RULER"
+     description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario."  # noqa: E501
      tags = ["long_context", "rag"]

      def __init__(self, max_num_words: int):
@@ -81,7 +81,7 @@ class RULERHotpotQAScenario(_RULERQAScenario):

  class RULERSQuADScenario(_RULERQAScenario):
      name = "ruler_squad"
-     description = "The SQuAD question answering scenario from RULER"
+     description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario."  # noqa: E501
      tags = ["long_context", "rag"]

      def __init__(self, max_num_words: int):
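Based on the constructors visible in these two hunks, a hedged sketch of how the RULER QA scenarios are parameterized; the word budget is an illustrative value, not taken from the diff:

    # Hypothetical sketch: both RULER QA scenarios take a max_num_words budget that
    # caps how much retrieved context is packed into each long-context instance.
    hotpot_scenario = RULERHotpotQAScenario(max_num_words=65_536)
    squad_scenario = RULERSQuADScenario(max_num_words=65_536)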