crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py (new file)
@@ -0,0 +1,246 @@
+from abc import abstractmethod
+from typing import Dict, List, Tuple, Optional
+
+import random
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Input,
+    Output,
+)
+
+
+class MELTClosedBookQAScenario(Scenario):
+    name = "melt_closed_book_qa"
+    description = "Closed Book Question Answering scenario."
+    tags = ["question_answering"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """
+        Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.revision = revision
+        self.splits = splits
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        for dataset_split_name, helm_split_name in splits.items():
+
+            for sample in dataset[dataset_split_name]:
+                instance = Instance(
+                    input=Input(text=sample["question"]),
+                    references=[Reference(Output(text=sample["answer"]), tags=[CORRECT_TAG])],
+                    split=helm_split_name,
+                )
+                instances.append(instance)
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTKnowledgeZaloScenario(MELTClosedBookQAScenario):
+    """
+    Scenario for the Zalo dataset.
+    """
+
+    name = "melt_knowledge_zalo"
+    description = "Zalo dataset for closed-book question answering."
+    tags = ["question_answering", "knowledge"]
+
+    def __init__(self):
+        super().__init__(
+            dataset_name="ura-hcmut/zalo_e2eqa",
+            revision="63494521f4de949bfa57a5f0b79bc3ee47e635ad",
+            splits={
+                TRAIN_SPLIT: "train",
+                TEST_SPLIT: "test",
+            },
+        )
+
+
+class MELTMultipleChoiceQAScenario(Scenario):
+    name = "melt_multiple_choice_qa"
+    description = "Multiple Choice Question Answering scenario."
+    tags = ["question_answering"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """
+        Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.revision = revision
+        self.splits = splits
+
+    @abstractmethod
+    def process_example(self, sample: dict) -> Tuple[Input, List[Reference]]:
+        """
+        Given an sample from the dataset, create the input text and
+        list of answers for the instance.
+        """
+        pass
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        for dataset_split_name, helm_split_name in splits.items():
+            for sample in dataset[dataset_split_name]:
+                inputs, references = self.process_example(sample)
+                instance = Instance(
+                    input=inputs,
+                    references=references,
+                    split=helm_split_name,
+                )
+                instances.append(instance)
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTKnowledgeViMMRCScenario(MELTMultipleChoiceQAScenario):
+    """
+    Scenario for the ViMMRC dataset.
+    """
+
+    name = "melt_knowledge_vimmrc"
+    description = "ViMMRC dataset for multiple choice question answering."
+    tags = ["question_answering", "knowledge"]
+
+    def __init__(self, randomize_order: bool = False):
+        super().__init__(
+            dataset_name="ura-hcmut/ViMMRC",
+            revision="fe68800e37aaa84d80b1d93466b36c3fa60d8bcb",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+        )
+        self.randomize_order = randomize_order
+        self.correct_answer_mapping = {
+            "A": 0,
+            "B": 1,
+            "C": 2,
+            "D": 3,
+            "E": 4,
+            "F": 5,
+            "G": 6,
+            "H": 7,
+        }
+        random.seed(42)
+
+    def process_example(self, sample: dict) -> Tuple[Input, List[Reference]]:
+        """
+        Given an sample from the dataset, create the input text and
+        list of answers for the instance.
+        """
+        inputs = PassageQuestionInput(
+            passage=sample["article"],
+            passage_prefix="Ngữ cảnh: ",
+            question=sample["question"],
+            question_prefix="Câu hỏi: ",
+            separator="\n\n",
+        )
+
+        correct_idx = self.correct_answer_mapping[sample["answer"]]
+        references = []
+        for idx, answer in enumerate(eval(sample["options"])):
+            if idx == correct_idx:
+                tags = [CORRECT_TAG]
+            else:
+                tags = []
+
+            references.append(Reference(Output(text=answer), tags=tags))
+
+        if self.randomize_order:
+            random.shuffle(references)
+
+        return inputs, references
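
The new knowledge scenarios above follow HELM's standard Scenario interface, so they can be exercised directly once the wheel is installed. The sketch below is for orientation only and is not part of the diff; it assumes the datasets dependency is installed and the pinned Hugging Face revision is reachable, and the output path is an arbitrary placeholder (the get_instances implementation shown above does not use it).

# Illustrative sketch (not part of the diff): building instances from the new ViMMRC scenario.
from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.benchmark.scenarios.melt_knowledge_scenario import MELTKnowledgeViMMRCScenario

scenario = MELTKnowledgeViMMRCScenario(randomize_order=False)
# Downloads ura-hcmut/ViMMRC at the pinned revision and maps its train/validation/test
# splits onto HELM's TRAIN_SPLIT/VALID_SPLIT/TEST_SPLIT.
instances = scenario.get_instances(output_path="scratch/vimmrc")
first = instances[0]
print(len(instances), first.split)
# Exactly one reference per instance carries CORRECT_TAG (the gold answer option).
print([CORRECT_TAG in ref.tags for ref in first.references])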
helm/benchmark/scenarios/melt_lm_scenarios.py (new file)
@@ -0,0 +1,252 @@
+from typing import Any, Dict, List, Optional
+
+import random
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MELTLMMaskFillingScenario(Scenario):
+    """
+    Scenario for the MELT Masked Language Modeling dataset.
+    """
+
+    name = "melt_lm_mask_filling"
+    description = "Masked Language Modeling scenario."
+    tags = ["language_modeling", "mask_filling"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        masked_ratio: float = 0.1,
+        text_key: str = "text",
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            masked_ratio: The ratio of tokens to mask in the input text. Defaults to 0.1.
+            text_key: The key to use for the text in the dataset. Defaults to "text".
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.masked_ratio = masked_ratio
+        self.text_key = text_key
+        self.revision = revision
+        self.splits = splits
+
+    def _mask_text(self, text: str) -> str:
+        """
+        Mask a portion of the input text.
+        Args:
+            text (str): The input text to mask.
+        Returns:
+            str: The masked text.
+        """
+        tokens = text.split(" ")
+        num_tokens_to_mask = int(len(tokens) * self.masked_ratio)
+        indices_to_mask = random.sample(range(len(tokens)), num_tokens_to_mask)
+        for index in indices_to_mask:
+            tokens[index] = "[MASK]"
+        return " ".join(tokens)
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset: Any = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+
+        for dataset_split_name, helm_split_name in splits.items():
+            for sample in dataset[dataset_split_name]:
+                target_sentence = sample[self.text_key]
+                source_sentence = self._mask_text(target_sentence)
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=helm_split_name,
+                    )
+                )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        random.seed(42)
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTLMMaskFillingMLQAScenario(MELTLMMaskFillingScenario):
+    """
+    Scenario for the MLQA dataset.
+    This dataset is a multilingual question answering dataset.
+    It contains questions in multiple languages and their corresponding
+    answers in the same language. In this scenario, we are using the
+    context of questions in the Vietnamese subset of the MLQA dataset.
+    """
+
+    name = "melt_lm_mask_filling_mlqa"
+    description = "MLQA dataset for masked language modeling."
+    tags = ["language_modeling", "mask_filling"]
+
+    def __init__(self):
+        super().__init__(
+            dataset_name="facebook/mlqa",
+            revision="397ed406c1a7902140303e7faf60fff35b58d285",
+            subset="mlqa.vi.vi",
+            text_key="context",
+            splits={
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+        )
+
+
+class MELTLMSpellingCorrectionScenario(Scenario):
+    """
+    Scenario for the MELT spelling correction dataset.
+    """
+
+    name = "melt_lm_spelling_correction"
+    description = "Spelling Correction scenario."
+    tags = ["language_modeling", "spelling_correction"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        source_key: str = "text",
+        target_key: str = "corrected_text",
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            source_key: The key to use for the source text in the dataset. Defaults to "text".
+            target_key: The key to use for the target text in the dataset. Defaults to "corrected_text".
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.source_key = source_key
+        self.target_key = target_key
+        self.revision = revision
+        self.splits = splits
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset: Any = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        if len(splits) == 1:
+            all_keys = list(splits.keys())
+            dataset = dataset[all_keys[0]].train_test_split(test_size=0.33, seed=42)
+            splits = {
+                "train": TRAIN_SPLIT,
+                "test": TEST_SPLIT,
+            }
+
+        for dataset_split_name, helm_split_name in splits.items():
+            for sample in dataset[dataset_split_name]:
+                source_sentence = sample[self.source_key]
+                target_sentence = sample[self.target_key]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=helm_split_name,
+                    )
+                )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        random.seed(42)
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTLMSpellingCorrectionVSECScenario(MELTLMSpellingCorrectionScenario):
+    """
+    Scenario for the VSEC dataset.
+    The VSEC dataset is a Vietnamese spelling correction dataset.
+    It contains 9,341 pairs of sentences where the first sentence is a misspelled
+    version of the second sentence, which is the correct version.
+    The mistakes are common spelling errors made by Vietnamese speakers and typists.
+    """
+
+    name = "melt_lm_spelling_correction_vsec"
+    description = "VSEC dataset for spelling correction."
+    tags = ["language_modeling", "spelling_correction"]
+
+    def __init__(self):
+        super().__init__(
+            dataset_name="ura-hcmut/VSEC",
+            revision="a6732e131605b5ec24ecc1745c6061c5ae86814e",
+            source_key="text",
+            target_key="correct",
+            splits={
+                TEST_SPLIT: "test",
+            },
+        )
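
The masking step that MELTLMMaskFillingScenario applies to each example can be reproduced standalone for clarity. The sketch below is not part of the diff: it mirrors the _mask_text method with a hypothetical module-level helper (mask_text) and an invented example sentence, under the same whitespace-tokenization and seeding assumptions as the code above.

# Illustrative sketch (not part of the diff): the token-masking strategy used by
# MELTLMMaskFillingScenario._mask_text, extracted into a standalone helper.
import random


def mask_text(text: str, masked_ratio: float = 0.1) -> str:
    # Split on single spaces and replace a random ~masked_ratio fraction of tokens with "[MASK]".
    tokens = text.split(" ")
    num_tokens_to_mask = int(len(tokens) * masked_ratio)
    for index in random.sample(range(len(tokens)), num_tokens_to_mask):
        tokens[index] = "[MASK]"
    return " ".join(tokens)


random.seed(42)  # the scenario seeds the RNG the same way before building instances
print(mask_text("Hôm nay trời đẹp nên chúng tôi đi dạo quanh hồ", masked_ratio=0.2))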