crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/scenarios/melt_translation_scenario.py
@@ -0,0 +1,152 @@
+from typing import Any, Dict, List, Optional
+
+from datasets import load_dataset, Dataset
+from helm.common.hierarchical_logger import htrack_block
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MELTTranslationScenario(Scenario):
+    name = "melt_translation"
+    description = "Machine Translation scenario."
+    tags = ["machine_translation"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        source_language: str,
+        target_language: str,
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            source_language: The source language to use.
+            target_language: The target language to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.MAX_TRAIN_INSTANCES = 20_000
+        valid_languages = set(["vi", "en"])
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.revision = revision
+        self.splits = splits
+        self.source_language = source_language
+        self.target_language = target_language
+        if self.source_language not in valid_languages or self.target_language not in valid_languages:
+            raise ValueError("Supported languages: vi, en.")
+        if self.source_language == self.target_language:
+            raise ValueError("The source language and the target language should be different.")
+        if self.source_language != "en" and self.target_language != "en":
+            raise ValueError("One of the languages should be English.")
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
+            hf_dataset: Any = load_dataset(
+                self.dataset_name,
+                self.subset,
+                revision=self.revision,
+                trust_remote_code=True,
+            )
+
+        instances: List[Instance] = []
+
+        for dataset_split_name, helm_split_name in splits.items():
+            if helm_split_name == TRAIN_SPLIT:
+                hf_dataset[dataset_split_name] = hf_dataset[dataset_split_name].shuffle(seed=42)[
+                    : self.MAX_TRAIN_INSTANCES
+                ]
+                hf_dataset[dataset_split_name] = Dataset.from_dict(hf_dataset[dataset_split_name])
+
+            for example in hf_dataset[dataset_split_name]:
+                source_sentence = example[self.source_language]
+                target_sentence = example[self.target_language]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=helm_split_name,
+                    )
+                )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTTranslationOPUS100Scenario(MELTTranslationScenario):
+    """
+    Scenario for the OPUS100 dataset.
+    """
+
+    name = "melt_translation_opus100"
+    description = "OPUS100 dataset for machine translation."
+    tags = ["machine_translation"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="vietgpt/opus100_envi",
+            revision="45df06fb0b31edc882d7c8d34389261f995e5208",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+            **kwargs,
+        )
+
+
+class MELTTranslationPhoMTScenario(MELTTranslationScenario):
+    """
+    Scenario for the PhoMT dataset.
+    """
+
+    name = "melt_translation_phomt"
+    description = "PhoMT dataset for machine translation."
+    tags = ["machine_translation"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="ura-hcmut/PhoMT",
+            revision="74386685db01dc038860ff0a90d9f5fbde284bf7",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+            **kwargs,
+        )

helm/benchmark/scenarios/mental_health_scenario.py
@@ -9,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     PassageQuestionInput,
     Output,
 )
+from helm.common.general import check_file_exists
 
 
 class MentalHealthScenario(Scenario):
@@ -48,10 +49,19 @@ class MentalHealthScenario(Scenario):
     """
 
     name = "mental_health"
-    description = "A dataset containing a counselor and mental health patient conversation, where the objective is to \
-        generate an empathetic counselor response."
+    description = (
+        "MentalHealth is a benchmark focused on evaluating empathetic communication in"
+        "mental health counseling. It includes simulated conversations between patients"
+        "and counselors, where the task is to generate compassionate and appropriate counselor"
+        "responses. The benchmark assesses a model's ability to support patients emotionally"
+        "and meaningfully engage in therapeutic conversations."
+    )
     tags = ["dialogue", "counseling", "mental_health", "empathy", "healthcare"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def process_dialogue_data(self, data: pd.DataFrame) -> List[Instance]:
         """
         Process the dialogue data into evaluation instances.
@@ -102,9 +112,10 @@ class MentalHealthScenario(Scenario):
         Returns:
             List[Instance]: List of processed instances for evaluation
         """
-        # Load the processed dialogue data
-        data_path = "/share/pi/nigam/data/medhelm/mental_health/processed_dialogues.csv"
-        dialogue_data = pd.read_csv(data_path)
+        check_file_exists(
+            self.data_path, msg=f"[MentalHealthScenario] Required data file not found: '{self.data_path}'"
+        )
+        dialogue_data = pd.read_csv(self.data_path)
 
         # Process into instances
         instances = self.process_dialogue_data(dialogue_data)

helm/benchmark/scenarios/mimic_bhc_scenario.py
@@ -1,7 +1,7 @@
 import json
 from typing import Dict, List
 
-from helm.common.general import ensure_directory_exists
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
 
 
 class MIMICBHCScenario(Scenario):
-    """
+    r"""
     MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
     course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
 
@@ -59,15 +59,20 @@ class MIMICBHCScenario(Scenario):
 
     name = "mimic_bhc"
     description = (
-        "A summarization task using a curated collection of preprocessed discharge notes"
-        " paired with their corresponding brief hospital course (BHC) summaries."
+        "MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief"
+        "Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV,"
+        "each paired with its corresponding BHC summary. The benchmark evaluates a model's"
+        "ability to condense detailed clinical information into accurate, concise summaries that"
+        "reflect the patient's hospital stay."
     )
     tags = ["summarization", "biomedical"]
 
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/share/pi/nigam/data/bhc-mimiciv/"
-        ensure_directory_exists(data_path)
-        data_path = data_path + "mimic_iv_bhc.json"
+        check_file_exists(self.data_path, msg=f"[MIMICBHCScenario] Required data file not found: '{self.data_path}'")
 
         instances: List[Instance] = []
         # Limit to zero shot setting for now
@@ -77,7 +82,7 @@ class MIMICBHCScenario(Scenario):
             "test": TEST_SPLIT,
         }
 
-        with open(data_path, "r") as f:
+        with open(self.data_path, "r") as f:
             data = [json.loads(line) for line in f]
 
         for data_split, split in splits.items():

helm/benchmark/scenarios/mimic_rrs_scenario.py
@@ -1,7 +1,7 @@
 import os
 from typing import Dict, List
 
-from helm.common.general import ensure_directory_exists
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -44,15 +44,18 @@ class MIMICRRSScenario(Scenario):
 
     name = "mimic_rrs"
     description = (
-        "A dataset containing radiology reports with findings sections from MIMIC-III paired with"
-        " their corresponding impression sections, used for generating radiology report summaries."
+        "MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III"
+        "database. It contains pairs of 'Findings' and 'Impression' sections, enabling evaluation"
+        "of a model's ability to summarize diagnostic imaging observations into concise, clinically"
+        "relevant conclusions."
    )
     tags = ["question_answering", "biomedical"]
 
-    def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/share/pi/nigam/data/rrs-mimiciii/all"
-        ensure_directory_exists(data_path)
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
 
+    def get_instances(self, output_path: str) -> List[Instance]:
         instances: List[Instance] = []
         # Limit to zero shot setting for now
         splits: Dict[str, str] = {
@@ -64,8 +67,14 @@ class MIMICRRSScenario(Scenario):
         for data_split, split in splits.items():
             split_findings_name: str = f"{data_split}.findings.tok"
             split_impressions_name: str = f"{data_split}.impression.tok"
-            findings_path: str = os.path.join(data_path, split_findings_name)
-            impressions_path: str = os.path.join(data_path, split_impressions_name)
+            findings_path: str = os.path.join(self.data_path, split_findings_name)
+            impressions_path: str = os.path.join(self.data_path, split_impressions_name)
+            check_file_exists(
+                findings_path, msg=f"[MIMICRRSScenario] Required findings file not found: '{findings_path}'"
+            )
+            check_file_exists(
+                impressions_path, msg=f"[MIMICRRSScenario] Required impressions file not found: '{impressions_path}'"
+            )
             findings: List[str] = self.read_file(findings_path)
             impressions: List[str] = self.read_file(impressions_path)
             assert len(findings) == len(impressions), "Findings and impressions must have the same length"

helm/benchmark/scenarios/mimiciv_billing_code_scenario.py
@@ -1,9 +1,8 @@
-import os
 import pandas as pd
 import numpy as np
 from typing import List
 
-from helm.common.general import ensure_directory_exists
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -24,21 +23,28 @@ class MIMICIVBillingCodeScenario(Scenario):
     """
 
     name = "mimiciv_billing_code"
-    description = "A dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes."
+    description = (
+        "MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the"
+        "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task"
+        "requires models to extract structured billing codes based on free-text clinical notes,"
+        "reflecting real-world hospital coding tasks for financial reimbursement."
+    )
     tags = ["question_answering", "biomedical"]
 
-    def __init__(self, data_file: str):
+    def __init__(self, data_path: str):
         """
-        :param data_file: Path to the mimiciv_icd10.feather file.
+        :param data_path: Path to the mimiciv_icd10.feather file.
         """
         super().__init__()
-        self.data_file = data_file
+        self.data_path = data_path
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        ensure_directory_exists(os.path.dirname(self.data_file))
+        check_file_exists(
+            self.data_path, msg=f"[MIMICIVBilligCodeScenario] Required data file not found: '{self.data_path}'"
+        )
 
         # Read the preprocessed MIMIC-IV data (.feather format)
-        df = pd.read_feather(self.data_file)  # columns: ["text", "target", ...]
+        df = pd.read_feather(self.data_path)  # columns: ["text", "target", ...]
 
         instances: List[Instance] = []
 

helm/benchmark/scenarios/mmlu_pro_scenario.py
@@ -40,7 +40,7 @@ class MMLUProScenario(Scenario):
 
     def __init__(self, subject: str):
         super().__init__()
-        self.subject: str = subject
+        self.subject: str = subject.replace("_", " ")
 
     def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
         """

helm/benchmark/scenarios/mmmlu_scenario.py
@@ -0,0 +1,85 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MMMLUScenario(Scenario):
+    """Multilingual Massive Multitask Language Understanding (MMMLU) by OpenAI
+
+    The MMLU is a widely recognized benchmark of general knowledge attained
+    by AI models. It covers a broad range of topics from 57 different categories,
+    covering elementary-level knowledge up to advanced professional subjects like
+    law, physics, history, and computer science.
+
+    MMMLU is a translation of MMLU’s test set into 14 languages using professional
+    human translators. Relying on human translators for this evaluation increases
+    confidence in the accuracy of the translations, especially for low-resource
+    languages like Yoruba.
+
+    The Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+
+    The MMMLU dataset is from here:
+
+    - https://huggingface.co/datasets/openai/MMMLU
+    """
+
+    name = "mmmlu"
+    description = "Multilingual Massive Multitask Language Understanding"
+    tags = ["knowledge", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+
+    def __init__(self, locale: str, subject: str):
+        super().__init__()
+        self.locale: str = locale
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/MMMLU",
+            self.locale,
+            revision="325a01dc3e173cac1578df94120499aaca2e2504",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            if self.subject != "all" and row["Subject"] != self.subject:
+                continue
+            input = Input(text=row["Question"])
+            references: List[Reference] = []
+            for option in self.OPTIONS:
+                references.append(
+                    Reference(
+                        output=Output(text=row[option]),
+                        tags=[CORRECT_TAG] if option == row["Answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/mtsamples_procedures_scenario.py
@@ -31,9 +31,12 @@ class MTSamplesProceduresScenario(Scenario):
     GITHUB_DIR_URL = f"https://github.com/raulista1997/benchmarkdata/tree/{GIT_HASH}/mtsample_procedure"
     RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsample_procedure/"
 
-    name = "mtsamples"
+    name = "mtsamples_procedures"
     description = (
-        "A dataset that provides a patient note regarding an operation, with the objective to document the procedure."
+        "MTSamples Procedures is a benchmark composed of transcribed operative notes,"
+        "focused on documenting surgical procedures. Each example presents a brief patient case"
+        "involving a surgical intervention, and the model is tasked with generating a coherent"
+        "and clinically accurate procedural summary or treatment plan."
     )
     tags = ["medical", "transcription", "plan_generation"]
 

helm/benchmark/scenarios/mtsamples_replicate_scenario.py
@@ -36,8 +36,9 @@ class MTSamplesReplicateScenario(Scenario):
 
     name = "mtsamples_replicate"
     description = (
-        "A dataset of clinical notes where the model is prompted to generate "
-        "a reasonable treatment plan for the patient based on transcribed medical reports."
+        "MTSamples Replicate is a benchmark that provides transcribed medical reports"
+        "from various specialties. It is used to evaluate a model's ability to generate clinically"
+        "appropriate treatment plans based on unstructured patient documentation"
     )
     tags = ["medical", "transcription", "plan_generation"]
 

helm/benchmark/scenarios/n2c2_ct_matching_scenario.py
@@ -188,8 +188,13 @@ class N2C2CTMatchingScenario(Scenario):
     """
 
     name = "n2c2_ct_matching"
-    description = "A dataset that provides clinical notes and asks the model to classify whether the \
-        patient is a valid candidate for a provided clinical trial."
+    description = (
+        "N2C2-CT is a benchmark designed to evaluate a model's ability to match patients to"
+        "appropriate clinical trials based on eligibility criteria. Each example includes a clinical"
+        "note and a trial description. The model is tasked with determining whether the patient"
+        "is a valid candidate for the trial. This benchmark supports automation and decision"
+        "support in clinical research enrollment."
+    )
     tags = []  # TODO
 
     POSSIBLE_ANSWER_CHOICES: List[str] = [
@@ -197,11 +202,12 @@
         "no",
     ]
 
-    def __init__(self, subject: str):
+    def __init__(self, data_path: str, subject: str):
         super().__init__()
         self.subject: str = subject  # specific inclusion criterion to assess
-        self.path_to_train_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/train/"
-        self.path_to_test_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/test/"
+        self.data_path: str = data_path
+        self.path_to_train_dir: str = os.path.join(self.data_path, "train/")
+        self.path_to_test_dir: str = os.path.join(self.data_path, "test/")
 
     def create_prompt(self, patient: Dict[str, Any]) -> str:
         # Cast None values to empty strings during string formatting, but keep the original functions returning None

helm/benchmark/scenarios/openai_mrcr_scenario.py
@@ -0,0 +1,79 @@
+import json
+import os
+import re
+from typing import List, Optional
+
+import datasets
+import tiktoken
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Output,
+    Reference,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class OpenAIMRCRScenario(Scenario):
+    """OpenAI MRCR scenario
+
+    OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking
+    an LLM's ability to distinguish between multiple needles hidden in context. This eval is
+    inspired by the MRCR eval first introduced by Gemini (https://arxiv.org/pdf/2409.12640v2).
+
+    The task is as follows: The model is given a long, multi-turn, synthetically generated
+    conversation between user and model where the user asks for a piece of writing about a topic,
+    e.g. "write a poem about tapirs" or "write a blog post about rocks". Hidden in this conversation
+    are 2, 4, or 8 identical asks, and the model is ultimately prompted to return the i-th instance
+    of one of those asks. For example, "Return the 2nd poem about tapirs".
+
+    Reference: https://huggingface.co/datasets/openai/mrcr"""
+
+    name = "openai_mrcr"
+    description = "OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2)."  # noqa: E501
+    tags = ["long_context", "mrcr"]
+
+    NEEDLES_OPTIONS = [2, 4, 8]
+
+    def __init__(self, needles: int, max_num_words: Optional[int] = None):
+        super().__init__()
+        self.needles = needles
+        self.max_num_words = max_num_words
+        if needles not in self.NEEDLES_OPTIONS:
+            raise Exception(f"Needles must be one of {self.NEEDLES_OPTIONS}")
+        self.tokenizer = tiktoken.get_encoding("o200k_base")
+
+    def count_words(self, messages: list[dict]) -> int:
+        return sum([len(re.split(r"\s+", m["content"].strip())) for m in messages])
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/mrcr",
+            cache_dir=cache_dir,
+            split="train",
+            data_files=[f"{self.needles}needle.parquet"],
+            revision="204b0d4e8d9ca5c0a90bf942fdb2a5969094adc0",
+        )
+        instances = []
+        for idx, row in enumerate(dataset):
+            messages = json.loads(row["prompt"])
+            if self.max_num_words and self.count_words(messages) > self.max_num_words:
+                continue
+            input = Input(messages=messages)
+            references = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instance = Instance(
+                id=f"{self.needles}needle{idx}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+                extra_data={"random_string_to_prepend": row["random_string_to_prepend"]},
+            )
+            instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/pubmed_qa_scenario.py
@@ -125,7 +125,12 @@ class PubMedQAScenario(Scenario):
     """
 
     name = "pubmed_qa"
-    description = "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions."
+    description = (
+        "PubMedQA is a biomedical question-answering dataset that evaluates a model's"
+        "ability to interpret scientific literature. It consists of PubMed abstracts paired with"
+        "yes/no/maybe questions derived from the content. The benchmark assesses a model's"
+        "capability to reason over biomedical texts and provide factually grounded answers."
+    )
     tags = ["question_answering", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]

helm/benchmark/scenarios/race_based_med_scenario.py
@@ -1,6 +1,6 @@
 import csv
+import os
 
-from filelock import FileLock
 from typing import Dict, List
 from docx import Document
 
@@ -13,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
 )
+from helm.common.general import ensure_file_downloaded
 
 
 def extract_red_text_runs(document):
@@ -87,10 +88,19 @@ class RaceBasedMedScenario(Scenario):
     """
 
     name = "race_based_med"
-    description = "A collection of LLM outputs in response to medical questions with race-based biases, \
-        with the objective being to classify whether the output contains racially biased content."
+    description = (
+        "RaceBias is a benchmark used to evaluate language models for racially biased or"
+        "inappropriate content in medical question-answering scenarios. Each instance consists"
+        "of a medical question and a model-generated response. The task is to classify whether"
+        "the response contains race-based, harmful, or inaccurate content. This benchmark"
+        "supports research into bias detection and fairness in clinical AI systems."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+    FILE_URL: str = (
+        "https://static-content.springer.com/esm/"
+        "art%3A10.1038%2Fs41746-023-00939-z/MediaObjects/41746_2023_939_MOESM1_ESM.docx"
+    )
 
     def create_benchmark(self, csv_path) -> Dict[str, str]:
         data = {}
@@ -114,12 +124,12 @@ class RaceBasedMedScenario(Scenario):
         return data
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        data_path = "/share/pi/nigam/data/medhelm/race_based/race_based.csv"
         # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
-        word_file = "/share/pi/nigam/data/medhelm/race_based/race_based.docx"
-        lock_path = data_path + ".lock"
-        with FileLock(lock_path):
-            # if not os.path.exists(data_path):
+        data_path = os.path.join(output_path, "race_based.csv")
+
+        if not os.path.exists(data_path):
+            word_file = os.path.join(output_path, "race_based.docx")
+            ensure_file_downloaded(source_url=self.FILE_URL, target_path=word_file, unpack=False)
             create_csv_from_word(word_file, data_path)
 
         instances: List[Instance] = []