crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -0,0 +1,109 @@
+ from typing import List, Tuple
+ import os
+ import json
+
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded
+
+
+ def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+     """
+     Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+     Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+     Args:
+         directory: Path to the directory containing the files
+
+     Returns:
+         List of tuples where each tuple contains (mp3_path, json_path)
+     """
+     pairs = []
+
+     # Walk through all directories and subdirectories
+     for root, _, files in os.walk(directory):
+         # Get all MP3 files in current directory
+         mp3_files = [f for f in files if f.endswith(".mp3")]
+
+         for mp3_file in mp3_files:
+             base_name = os.path.splitext(mp3_file)[0]
+             json_file = f"{base_name}.json"
+
+             # Check if corresponding JSON file exists in the same directory
+             if json_file in files:
+                 mp3_path = os.path.join(root, mp3_file)
+                 json_path = os.path.join(root, json_file)
+                 pairs.append((mp3_path, json_path))
+
+     return pairs
+
+
+ class UltraSuiteDisorderSymptomsScenario(Scenario):
+     """
+     A scenario identifying features of speech disorders within the provided audio.
+     The audio files contain speech from children, potentially with an adult present.
+     """
+
+     name = "speech_disorder"
+     description = "A scenario for evaluating speech disorders in children"
+     tags = ["audio", "classification", "speech_disorder"]
+     HF_MAPPING_URL = "https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels"
+
+     def get_instruction(self, words: str) -> str:
+         prompt = f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording and recognize any abnormal features in the child's speech. 2. These features can be on of the following: A - 'substitution', B - 'omission', C - 'addition', D - 'typically_developing', or E - 'stuttering'. Here, 'substitution' is when the child substitutes one word/phrase/syllable for another. 'omission' is when the child omits one word/phrase/syllable. 'addition' is when the child adds one word/phrase/syllable. 'typically_developing' is when the child's speech is typical of a child of their age. 'stuttering' is when the child stutters, has difficulty speaking, repeats sounds/words or prolongs sounds/words. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text.""" # noqa: E501
+
+         return prompt
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         """
+         Create instances from the audio files and their corresponding JSON annotations.
+         The data directory should contain:
+         - Audio files (e.g., .mp3)
+         - A JSON file with annotations containing 'answer' field
+         """
+         print(f"Downloading dataset from {UltraSuiteDisorderSymptomsScenario.HF_MAPPING_URL} to {output_path}")
+         ensure_file_downloaded(source_url=UltraSuiteDisorderSymptomsScenario.HF_MAPPING_URL, target_path=output_path)
+
+         instances: List[Instance] = []
+         split: str = TEST_SPLIT
+
+         # Find all pairs of audio and JSON files
+         pairs = find_audio_json_pairs(output_path)
+
+         for audio_path, json_path in tqdm(pairs):
+
+             # Load the annotation
+             with open(json_path, "r") as f:
+                 annotation = json.load(f)
+
+             # Get the correct answer and convert to label
+             if "disorder_symptom" not in annotation or "transcription" not in annotation:
+                 continue
+             label = annotation["disorder_symptom"]
+             prompt = annotation["transcription"]
+             # Create references for each option
+             references: List[Reference] = []
+             for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
+                 references.append(reference)
+
+             # Create the input with audio and instruction
+             content = [
+                 MediaObject(content_type="audio/mpeg", location=audio_path),
+                 MediaObject(content_type="text/plain", text=self.get_instruction(prompt)),
+             ]
+
+             input = Input(multimedia_content=MultimediaObject(content))
+             instances.append(Instance(input=input, references=references, split=split))
+
+         return instances
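
The hunk above adds the new module helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py (file 80 in the list). As a rough illustration of how its find_audio_json_pairs helper pairs audio clips with their annotations, here is a minimal sketch; the directory layout and annotation contents are invented for demonstration, and it assumes crfm-helm 0.5.6 is installed so the module can be imported.

    import json
    import os
    import tempfile

    from helm.benchmark.scenarios.audio_language.ultra_suite_disorder_symptoms_scenario import (
        find_audio_json_pairs,
    )

    with tempfile.TemporaryDirectory() as root:
        clip_dir = os.path.join(root, "clips")
        os.makedirs(clip_dir)
        # An MP3 with a matching JSON annotation is paired; an unmatched MP3 is ignored.
        open(os.path.join(clip_dir, "child_001.mp3"), "wb").close()
        with open(os.path.join(clip_dir, "child_001.json"), "w") as f:
            json.dump({"disorder_symptom": "omission", "transcription": "red lorry"}, f)
        open(os.path.join(clip_dir, "child_002.mp3"), "wb").close()

        print(find_audio_json_pairs(root))
        # [('<root>/clips/child_001.mp3', '<root>/clips/child_001.json')]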
@@ -46,6 +46,10 @@ class VocalSoundScenario(Scenario):
      description = "Classify an audio sample of a spoken digit ([Gong et al, 2022](https://arxiv.org/abs/2205.03433))."
      tags: List[str] = ["audio", "classification"]

+     def __init__(self, sound: str) -> None:
+         super().__init__()
+         self._sound: str = sound
+
      def get_instances(self, output_path: str) -> List[Instance]:
          instances: List[Instance] = []
          down_loading_path = os.path.join(output_path, "download")
@@ -53,7 +57,12 @@ class VocalSoundScenario(Scenario):
          wav_save_dir = os.path.join(down_loading_path, "audio_16k")
          for file_name in tqdm(os.listdir(wav_save_dir)):
              local_audio_path: str = os.path.join(wav_save_dir, file_name)
-             if not file_name.endswith(".wav") or is_invalid_audio_file(local_audio_path):
+             if (
+                 not file_name.endswith(".wav")
+                 or is_invalid_audio_file(local_audio_path)
+                 # Skip this problematic file
+                 or file_name == "m0083_0_sneeze.wav"
+             ):
                  continue

              input = Input(
@@ -61,9 +70,14 @@ class VocalSoundScenario(Scenario):
              )

              answer: str = file_name.split("_")[-1].split(".")[0]
+             if answer.lower() != self._sound:
+                 continue
+
              if answer == "throatclearing":
                  answer = "throat clearing"

              references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
              instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+         assert len(instances) > 0, f"No instances found for sound: {self._sound}"
          return instances
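
With the VocalSoundScenario change above, the scenario now takes the target sound as a constructor argument and keeps only instances whose filename-derived label matches it. A hedged usage sketch follows; the output path is hypothetical and the first call downloads the audio data.

    from helm.benchmark.scenarios.audio_language.vocal_sound_scenario import VocalSoundScenario

    # One scenario instance per target sound, e.g. "cough" or "sneeze".
    scenario = VocalSoundScenario(sound="cough")
    instances = scenario.get_instances(output_path="./benchmark_output/vocal_sound")
    print(len(instances))  # the new assert raises if no matching instances were found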
@@ -40,8 +40,7 @@ class VoxCeleb2Scenario(Scenario):
          "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
      )
      IDENTITY_INSTRUCTION = (
-         "Listen to the audio and take your best guess to determine if the two speakers are the same person. "
-         "Give just the letter of your answer and nothing else."
+         "Listen to the audio and take your best guess to determine if the two speakers are the same person."
      )

      name = "voxceleb2"
@@ -12,7 +12,7 @@ from helm.benchmark.scenarios.scenario import (
      Output,
  )
  from helm.common.general import ensure_directory_exists
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn


  class AutoBencherCapabilitiesScenario(Scenario):
@@ -61,7 +61,7 @@ class AutoBencherCapabilitiesScenario(Scenario):
              # References are category ID, followed by level 2, 3 and 4 category names.
              references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
              if row["gold_answer"] is None:
-                 hlog(f"WARNING: Row had no gold_answer: {row}")
+                 hwarn(f"Row had no gold_answer: {row}")
                  continue
              instance = Instance(input=input, references=references, split=TEST_SPLIT)
              instances.append(instance)
@@ -1,8 +1,7 @@
- import os
  import pandas as pd
  from typing import List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -61,28 +60,30 @@ Clinical Note:

  class CHWCarePlanScenario(Scenario):
      """
-     A scenario for MIMIC-IV discharge summaries where the task is to predict the ICD-10 code(s).
+     A scenario for a dataset containing free form text of a clinical health worker care plan, with the
+     associated goal being to restructure that text into a given format.

      - Input: The clinical note (column "MO Note").
      - Output: The clinical note (column "MO Note"). We will use this note as the reference for entailment.
      """

      name = "chw_care_plan"
-     description = "A dataset containing free form text of a clinical health worker care plan, with the \
-     associated goal being to restructure that text into a given format."
+     description = (
+         "NoteExtract is a benchmark that focuses on the structured extraction of information"
+         "from free-form clinical text. It provides care plan notes authored by health workers"
+         "and evaluates a model's ability to convert them into a predefined structured format,"
+         "such as fields for Chief Complaint and History of Present Illness. The benchmark"
+         "emphasizes faithful extraction without hallucination or inference."
+     )
      tags = ["question_answering", "biomedical"]

-     def __init__(self):
-         """
-         :param data_file: Path to the mimiciv_icd10.feather file.
-         """
+     def __init__(self, data_path: str):
          super().__init__()
-         self.data_file = "/share/pi/nigam/datasets/CHW_Dataset.csv"
+         self.data_path = data_path

      def get_instances(self, output_path: str) -> List[Instance]:
-         ensure_directory_exists(os.path.dirname(self.data_file))
-
-         df = pd.read_csv(self.data_file) # columns: ["text", "target", ...]
+         check_file_exists(self.data_path, msg=f"[CHWCarePlanScenario] Required data file not found: '{self.data_path}'")
+         df = pd.read_csv(self.data_path) # columns: ["text", "target", ...]

          instances: List[Instance] = []

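
Several MedHELM scenarios in this release (CHWCarePlanScenario above, and CLEARScenario, DischargeMeScenario, and EHRSHOTScenario below) replace hard-coded cluster paths with an explicit data_path constructor argument. A minimal sketch of the new calling convention, with a hypothetical local CSV path; check_file_exists raises if the file is missing.

    from helm.benchmark.scenarios.chw_care_plan_scenario import CHWCarePlanScenario

    # The CSV location is now supplied by the caller instead of being hard-coded.
    scenario = CHWCarePlanScenario(data_path="/data/medhelm/CHW_Dataset.csv")  # hypothetical path
    instances = scenario.get_instances(output_path="./benchmark_output/chw_care_plan")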
@@ -2,7 +2,7 @@ import os
  import pandas as pd
  from typing import List

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -76,16 +76,21 @@ class CLEARScenario(Scenario):
          "unemployment": "unemployment",
      }

-     def __init__(self, condition: str):
+     def __init__(self, condition: str, data_path: str):
          """Initialize the scenario with a specific medical condition"""
          super().__init__()

          if condition not in self.CONDITIONS:
              raise ValueError(f"Condition '{condition}' not supported. Available conditions: {self.CONDITIONS}")
-
+         self.data_path = data_path
          self.condition = condition
          self.name = f"clear_{condition}"
-         self.description = f"A dataset for evaluating {self.CONDITION_PROMPTS[condition]} detection from patient notes with yes/no/maybe classifications." # noqa: E501
+         self.description = (
+             "CLEAR is a benchmark designed to evaluate models on their ability to detect medical"
+             "conditions from patient notes using categorical responses. Each instance consists of"
+             "a clinical note and a target condition, requiring the model to classify the patient's"
+             "history as either affirmative, negative, or uncertain."
+         ) # noqa: E501
          self.tags = ["classification", "biomedical", condition.replace("_", "-")]

      def get_answer_choices(self) -> List[str]:
@@ -95,9 +100,8 @@ class CLEARScenario(Scenario):

      def get_instances(self, output_path: str) -> List[Instance]:
          """Load and process the data for the specified conditon."""
-         data_dir = "/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/"
-         excel_path = os.path.join(data_dir, f"{self.condition}.xlsx")
-         ensure_directory_exists(os.path.dirname(excel_path))
+         excel_path = os.path.join(self.data_path, f"{self.condition}.xlsx")
+         check_file_exists(excel_path, msg=f"[CLEARScenario] Required data file not found: '{excel_path}'")

          df = pd.read_excel(excel_path)

@@ -1,5 +1,5 @@
  from typing import List
- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
      Input,
      Scenario,
@@ -21,26 +21,34 @@ def file_preprocessing(data_path: str, task_objective: str) -> pd.DataFrame:
      data_path is directory that contains the downloaded files: '{base_dir}/physionet.org/'
      """
      # Load the first CSV file
-     df_diagnosis = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/diagnosis.csv.gz", compression="gzip", keep_default_na=False
+     diagnosis_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/diagnosis.csv.gz"
+     check_file_exists(
+         diagnosis_path, msg=f"[DischargeMeScenario] Required diagnosis file not found: '{diagnosis_path}'"
      )
-     df_discharge = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz", compression="gzip", keep_default_na=False
+     discharge_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz"
+     check_file_exists(
+         discharge_path, msg=f"[DischargeMeScenario] Required discharge file not found: '{discharge_path}'"
      )
+     target_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge_target.csv.gz"
+     check_file_exists(target_path, msg=f"[DischargeMeScenario] Required target file not found: '{target_path}'")
+     radiology_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz"
+     check_file_exists(
+         radiology_path, msg=f"[DischargeMeScenario] Required radiology file not found: '{radiology_path}'"
+     )
+     ed_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz"
+     check_file_exists(ed_path, msg=f"[DischargeMeScenario] Required ed file not found: '{ed_path}'")
+     triage_path = f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz"
+     check_file_exists(triage_path, msg=f"[DischargeMeScenario] Required triage file not found: '{triage_path}'")
+     df_diagnosis = pd.read_csv(diagnosis_path, compression="gzip", keep_default_na=False)
+     df_discharge = pd.read_csv(discharge_path, compression="gzip", keep_default_na=False)
      df_target = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge_target.csv.gz",
+         target_path,
          compression="gzip",
          keep_default_na=False,
      )
-     df_radiology = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz", compression="gzip", keep_default_na=False
-     )
-     df_ed = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz", compression="gzip", keep_default_na=False
-     )
-     df_triage = pd.read_csv(
-         f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz", compression="gzip", keep_default_na=False
-     )
+     df_radiology = pd.read_csv(radiology_path, compression="gzip", keep_default_na=False)
+     df_ed = pd.read_csv(ed_path, compression="gzip", keep_default_na=False)
+     df_triage = pd.read_csv(triage_path, compression="gzip", keep_default_na=False)
      df_diagnosis_triage = pd.merge(
          df_diagnosis, df_triage, on="subject_id", how="inner", suffixes=("_df_diagnosis", "_df_triage")
      )
@@ -113,16 +121,23 @@ class DischargeMeScenario(Scenario):
      """

      name = "dischargeme"
-     description = "DischargeMe is a discharge instruction generation dataset and brief hospital course generation \
-     dataset collected from MIMIC-IV data, consindering only the discharge text as well as the radiology report text."
+     description = (
+         "DischargeMe is a benchmark designed to evaluate clinical text generation. It pairs"
+         "discharge summaries and radiology reports from MIMIC-IV with generation tasks"
+         "such as writing discharge instructions or summarizing the brief hospital course. The"
+         "benchmark assesses a model's ability to generate patient-facing documentation that is"
+         "complete, empathetic, and clinically accurate."
+     )
      tags = ["biomedical"]

+     def __init__(self, data_path: str):
+         super().__init__()
+         self.data_path = data_path
+
      def get_instances(self, output_path: str) -> List[Instance]:
-         data_path = "/share/pi/nigam/data/physionet.org"
-         ensure_directory_exists(data_path)
          instances: List[Instance] = []
-         df_bhc = file_preprocessing(data_path, "brief_hospital_course")
-         df_di = file_preprocessing(data_path, "discharge_instructions")
+         df_bhc = file_preprocessing(self.data_path, "brief_hospital_course")
+         df_di = file_preprocessing(self.data_path, "discharge_instructions")

          for i in range(df_bhc.shape[0]):
              prompt_bhc = create_prompt(
@@ -36,7 +36,13 @@ class EhrSqlScenario(Scenario):
      )

      name = "ehr_sql"
-     description = "Given a natural language instruction, generate an SQL query that would be used in clinical research."
+     description = (
+         "EHRSQL is a benchmark designed to evaluate models on generating structured queries"
+         "for clinical research. Each example includes a natural language question and a database"
+         "schema, and the task is to produce an SQL query that would return the correct result"
+         "for a biomedical research objective. This benchmark assesses a model's understanding"
+         "of medical terminology, data structures, and query construction."
+     )
      tags = ["sql", "medical", "reasoning"]

      def setup_database(self, output_path: str) -> str:
@@ -3,12 +3,11 @@ import os
  import pandas as pd
  import tiktoken

- from filelock import FileLock
  from functools import partial
  from tqdm import tqdm
  from typing import Any, Dict, List, Optional, Mapping

- from helm.common.general import ensure_directory_exists
+ from helm.common.general import check_file_exists, ensure_directory_exists
  from helm.benchmark.scenarios.scenario import (
      TEST_SPLIT,
      Input,
@@ -1411,7 +1410,10 @@ class EHRSHOTScenario(Scenario):

      name = "ehrshot"
      description = (
-         "A dataset given a patient record of EHR codes, classifying if an event will occur at a future date or not."
+         "EHRSHOT is a benchmark designed to evaluate a model's ability to predict future"
+         "clinical events using structured EHR data. Each instance contains a patient's"
+         "historical EHR data and a forward-looking clinical question about whether a particular"
+         "diagnosis, lab result, or hospital event will occur."
      )
      tags = [] # TODO

@@ -1420,24 +1422,32 @@ class EHRSHOTScenario(Scenario):
          "no",
      ]

-     def __init__(self, subject: str, max_length: Optional[int] = None):
+     def __init__(self, subject: str, data_path: str, max_length: Optional[int] = None):
          super().__init__()
          self.subject: str = subject # same as "task" or "labeling_function"
-         self.path_to_meds_dir: str = "/share/pi/nigam/data/medhelm/ehrshot/meds/"
-         self.path_to_tmp_dir: str = "/share/pi/nigam/data/medhelm/ehrshot/prompts/"
          self.max_length = max_length
+         self.data_path = data_path

-     def create_benchmark(self, n_procs: int = 4) -> Dict[str, str]:
+     def create_benchmark(self, output_path: str, n_procs: int = 4) -> Dict[str, str]:
          """Loads the MEDS dataset and converts it to prompts"""
-
          # Load MEDS EHRSHOT patient timelines
-         df_data = pd.read_parquet(os.path.join(self.path_to_meds_dir, "data/data.parquet"))
-         df_splits = pd.read_parquet(os.path.join(self.path_to_meds_dir, "metadata/subject_splits.parquet"))
-
+         data_parquet_path = os.path.join(self.data_path, "data/data.parquet")
+         check_file_exists(
+             data_parquet_path, msg=f"[EHRSHOTScenario] Required parquet data file not found: '{data_parquet_path}'"
+         )
+         splits_parquet_path = os.path.join(self.data_path, "metadata/subject_splits.parquet")
+         check_file_exists(
+             splits_parquet_path, msg=f"[EHRSHOTScenario] Required splits file not found: '{splits_parquet_path}'"
+         )
+         df_data = pd.read_parquet(data_parquet_path)
+         df_splits = pd.read_parquet(splits_parquet_path)
          # Load MEDS EHRSHOT labels
-         tasks = sorted(os.listdir(os.path.join(self.path_to_meds_dir, "labels")))
+         tasks = sorted(os.listdir(os.path.join(self.data_path, "labels")))
          for t in tasks:
-             path_to_labels: str = os.path.join(self.path_to_meds_dir, "labels", t, "labels.parquet")
+             path_to_labels: str = os.path.join(self.data_path, "labels", t, "labels.parquet")
+             check_file_exists(
+                 path_to_labels, msg=f"[EHRSHOTScenario] Required labels file not found: '{path_to_labels}'"
+             )
              if t != self.subject or not os.path.exists(path_to_labels):
                  continue
              df_labels = pd.read_parquet(path_to_labels)
@@ -1470,18 +1480,16 @@ class EHRSHOTScenario(Scenario):
              df_labels["prompt"] = prompts

              # Save to parquet
-             path_to_output_dir: str = os.path.join(self.path_to_tmp_dir, self.subject)
+             path_to_output_dir: str = os.path.join(output_path, self.subject)
              ensure_directory_exists(path_to_output_dir)
              df_labels.to_parquet(os.path.join(path_to_output_dir, "medhelm_prompts.parquet"))
              return {"status": "success"}

      def get_instances(self, output_path: str) -> List[Instance]:
-         path_to_input_csv: str = os.path.join(self.path_to_tmp_dir, self.subject, "medhelm_prompts.parquet")
-         lock_path = path_to_input_csv + ".lock"
-         with FileLock(lock_path):
-             if not os.path.exists(path_to_input_csv):
-                 print(f"Creating benchmark from SCRATCH for {self.subject}...")
-                 self.create_benchmark() # Create benchmark from scratch
+         path_to_input_csv: str = os.path.join(output_path, self.subject, "medhelm_prompts.parquet")
+         if not os.path.exists(path_to_input_csv):
+             print(f"Creating benchmark from SCRATCH for {self.subject}...")
+             self.create_benchmark(output_path=output_path) # Create benchmark from scratch

          # Load data for this task
          df = pd.read_parquet(path_to_input_csv)
@@ -1509,38 +1517,3 @@ class EHRSHOTScenario(Scenario):
              )

          return instances
-
-
- if __name__ == "__main__":
-     # Generate statistics on prompts
-     from transformers import AutoTokenizer
-
-     tokenizer = AutoTokenizer.from_pretrained("gpt2")
-     tqdm.pandas()
-     n_procs: int = 10
-
-     os.makedirs("./ehrshot_stats", exist_ok=True)
-     for t in TASK_FULL_NAMES.keys():
-         # Skip if already exists
-         if os.path.exists(f"./ehrshot_stats/{t}.txt"):
-             print(f"Skipping {t} because it already exists")
-             continue
-
-         # Create benchmark
-         scenario = EHRSHOTScenario(subject=t)
-         scenario.create_benchmark(n_procs=n_procs)
-         instances = scenario.get_instances("test.csv")
-
-         # Calculate prompt token stats
-         path_to_input_csv = os.path.join(scenario.path_to_tmp_dir, scenario.subject, "medhelm_prompts.parquet")
-         df = pd.read_parquet(path_to_input_csv)
-         df["prompt_n_tokens"] = df["prompt"].progress_apply(lambda x: len(tokenizer.encode(x)))
-         with open(f"./ehrshot_stats/{t}.txt", "w") as f:
-             f.write("-" * 100 + "\n")
-             f.write(f"Task: {t}\n")
-             f.write(f"# of instances: {len(instances)}\n")
-             f.write(f"# of positives: {df['boolean_value'].sum()}\n")
-             f.write(f"Size of splits:\n{df['split'].value_counts()}\n")
-             f.write(f"# tokens per prompt:\n{df['prompt_n_tokens'].describe()}\n")
-             f.write("-" * 100 + "\n")
-         df.to_parquet(os.path.join(scenario.path_to_tmp_dir, scenario.subject, "medhelm_prompts.parquet"))
@@ -2,7 +2,7 @@ from collections import defaultdict
  from dataclasses import dataclass, field, replace
  from functools import cached_property
  from typing import List, Optional
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn

  import dacite
  import re
@@ -111,7 +111,7 @@ def validate_grammar(grammar: Grammar):
          # Make sure all categories are defined
          for category in expansion.categories:
              if category not in grammar.category_to_rules:
-                 hlog(f"WARNING: Category {category} is not defined")
+                 hwarn(f"Category {category} is not defined")


  def read_grammar(path: str) -> Grammar:
@@ -57,7 +57,12 @@ class HeadQAScenario(Scenario):
      SKIP_TEXTQA: bool = False

      name = "head_qa"
-     description = "A collection of biomedical multiple-choice questions for testing medical knowledge."
+     description = (
+         "HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to"
+         "evaluate a model's medical knowledge and reasoning. Each instance presents a clinical"
+         "or scientific question with four answer options, requiring the model to select the most"
+         "appropriate answer."
+     )
      tags = ["question_answering", "biomedical", "medicine"]

      def __init__(self, language: str = "en", category: Optional[str] = None):
@@ -0,0 +1,85 @@
+ import os
+ import re
+ from typing import List
+
+ from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Input,
+     Reference,
+     Output,
+     CORRECT_TAG,
+     TEST_SPLIT,
+ )
+ from helm.common.general import ensure_directory_exists
+
+
+ class InfiniteBenchEnQAScenario(Scenario):
+     """InfiniteBench En.QA
+
+     InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+     understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
+     InfiniteBench that requires models to perform open-form question answering on questions that necessitate
+     long-range dependency and reasoning, beyond simple short passage retrieval.
+     """
+
+     name = "infinite_bench_en_qa"
+     description = "∞Bench En.QA is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
+     tags = ["question_answering"]
+
+     def __init__(self, max_num_words: int):
+         self.max_num_words = max_num_words
+         super().__init__()
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         # Get InfiniteBench from HuggingFace
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+
+         # Define the features schema
+         ft = Features(
+             {
+                 "id": Value("int64"),
+                 "context": Value("string"),
+                 "input": Value("string"),
+                 "answer": Sequence(Value("string")),
+                 "options": Sequence(Value("string")),
+             }
+         )
+
+         # Load the dataset with the specified features
+         dataset = load_dataset(
+             "xinrongzhang2022/InfiniteBench",
+             split="longbook_qa_eng",
+             features=ft,
+             revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+         )
+
+         assert isinstance(dataset, Dataset)
+
+         def count_words(text: str) -> int:
+             return len(re.split(r"\s+", text.strip()))
+
+         dataset = dataset.filter(
+             lambda example: count_words(example["context"])
+             + count_words(example["input"])
+             + sum(count_words(option) for option in example["options"])
+             <= self.max_num_words
+         )
+
+         # Read all instances
+         instances: List[Instance] = []
+         for row in dataset:
+             id = row["id"]
+             input = Input(text=row["context"] + "\n\n" + row["input"])
+             instance = Instance(
+                 id=id,
+                 input=input,
+                 references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                 split=TEST_SPLIT,
+             )
+             instances.append(instance)
+
+         return instances
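
The new InfiniteBenchEnQAScenario above drops any example whose combined context, question, and options exceed max_num_words. A small self-contained sketch of that filter follows; the rows are invented, and count_words mirrors the helper defined inside get_instances.

    import re

    def count_words(text: str) -> int:
        return len(re.split(r"\s+", text.strip()))

    max_num_words = 8
    rows = [
        {"context": "A short novel excerpt.", "input": "Who wrote it?", "options": []},
        {"context": "word " * 20, "input": "Too long to keep?", "options": []},
    ]
    kept = [
        row
        for row in rows
        if count_words(row["context"])
        + count_words(row["input"])
        + sum(count_words(o) for o in row["options"])
        <= max_num_words
    ]
    print(len(kept))  # 1: only the short row survives the filter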