crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -102,3 +104,14 @@ class LegalSupportScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_support",
+            display_name="LegalSupport",
+            description="Scenario introduced in this work to measure fine-grained legal reasoning "
+            "through reverse entailment.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
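
The same pattern repeats across the scenario diffs that follow: each Scenario gains a get_metadata() method returning a ScenarioMetadata, usually carrying a TaxonomyInfo that records task, subject matter, time period, source, and language. The real definitions ship in this wheel (helm/benchmark/scenarios/scenario.py and helm/benchmark/presentation/taxonomy_info.py, both in the file list above); the sketch below is only a rough guess at their shape, inferred from these call sites rather than from the package itself.

# Hypothetical shapes inferred from the get_metadata() call sites in this diff;
# not the actual definitions from the crfm-helm package.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    task: Optional[str] = None      # e.g. "question answering"
    what: Optional[str] = None      # what the inputs consist of
    when: Optional[str] = None      # time period covered
    who: Optional[str] = None       # who produced the text
    language: Optional[str] = None  # e.g. "English"


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str                                # machine-readable scenario name
    display_name: str                        # name shown in the frontend
    description: str                         # Markdown description
    main_metric: str                         # headline metric for the leaderboard
    main_split: str                          # split the headline metric is computed on
    short_display_name: Optional[str] = None
    taxonomy: Optional[TaxonomyInfo] = None
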
@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )

 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
@@ -144,3 +146,21 @@ class LegalBenchScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legalbench",
+            display_name="LegalBench",
+            description="LegalBench is a large collaboratively constructed benchmark of legal "
+            "reasoning. Five representative tasks are included here. See [(Guha et al, "
+            "2023)[https://arxiv.org/abs/2308.11462] for more details.",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="fact patterns, questions, and legal documents",
+                when="n/a",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )

 ECTHR_A = "ecthr_a"
@@ -261,3 +262,13 @@ class LexGLUEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lex_glue",
+            display_name="LexGLUE",
+            description="A Benchmark Dataset for Legal Language Understanding in English",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Output,
     Input,
+    ScenarioMetadata,
 )


@@ -466,3 +467,13 @@ class LEXTREMEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lextreme",
+            display_name="LEXTREME",
+            description="A Multilingual Legal Benchmark for Natural Language Understanding",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
@@ -2,6 +2,7 @@ import os
 import json
 from typing import Dict, List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )


@@ -143,3 +145,15 @@ class LSATScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lsat_qa",
+            display_name="LSAT",
+            description="The LSAT benchmark for measuring analytical reasoning on the Law School "
+            "Admission Test (LSAT; [Zhong et al., "
+            "2021](https://arxiv.org/pdf/2104.06598.pdf)).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
@@ -0,0 +1,73 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MadinahQAScenario(Scenario):
+    """MadinahQA Scenario"""
+
+    name = "madinah_qa"
+    description = "Arabic language competency benchmark"
+    tags = ["language", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+    HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+    SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        subset = subset.replace("_", " ")
+        if subset not in self.SUBSETS:
+            raise Exception(f"Unknown subset: {subset}")
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        instances: List[Instance] = []
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "MBZUAI/MadinahQA",
+            self.subset,
+            revision="62e7c86ac5c07245a5a952722691d77ddb41f695",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["Question"])
+                references: List[Reference] = []
+                correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                for option_index in range(1, 6):
+                    column_name = f"Option {option_index}"
+                    if not row[column_name]:
+                        continue
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
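
In the new MadinahQA scenario above, the gold label is a letter ("Answer Key") while the choices live in columns "Option 1" through "Option 5", so the letter is converted to a 1-based column index before tagging the correct reference; empty option cells are skipped. A standalone illustration of that mapping (not code from the package):

# "A" -> Option 1, "C" -> Option 3, etc., matching the "Option N" column names.
def answer_key_to_option_index(answer_key: str) -> int:
    return ord(answer_key) - ord("A") + 1


assert answer_key_to_option_index("A") == 1
assert answer_key_to_option_index("C") == 3
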
@@ -4,6 +4,7 @@ import typing
 from typing import Dict, List, Optional
 from datasets import load_dataset, DatasetDict

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -450,3 +452,27 @@ class MATHScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.use_chain_of_thought:
+            return ScenarioMetadata(
+                name="math_chain_of_thought",
+                display_name="MATH (chain-of-thought)",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="math_equiv_chain_of_thought",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="math_regular",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems [(Hendrycks et al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="math_equiv",
+                main_split="test",
+            )
@@ -0,0 +1,68 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MBZUAIHumanTranslatedArabicMMLUScenario(Scenario):
+    """MBZUAI Human-Translated Arabic MMLU
+
+    A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+    """  # noqa: E501
+
+    name = "mbzuai_human_translated_arabic_mmlu"
+    description = (
+        "A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark"
+    )
+    tags = ["knowledge", "multiple_choice"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "MBZUAI/human_translated_arabic_mmlu",
+            self.subject,
+            revision="5ed7830fd678cfa6f2d7f0a1a13a4e1a1fa422ac",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            input = Input(text=row["question"])
+            references: List[Reference] = []
+            for choice_index, choice in enumerate(row["choices"]):
+                references.append(
+                    Reference(
+                        output=Output(text=choice),
+                        tags=[CORRECT_TAG] if choice_index == row["answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id-{self.subject}-{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
@@ -2,8 +2,18 @@ import json
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Input,
+    Output,
+    ScenarioMetadata,
+)


 class MedDialogScenario(Scenario):
@@ -133,3 +143,24 @@ class MedDialogScenario(Scenario):
         )

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_dialog",
+            display_name="MedDialog",
+            short_display_name="MedDialog",
+            description="MedDialog is a benchmark of real-world doctor-patient conversations focused on "
+            "health-related concerns and advice. Each dialogue is paired with a "
+            "one-sentence summary that reflects the core patient question or exchange. The "
+            "benchmark evaluates a model's ability to condense medical dialogue into "
+            "concise, informative summaries.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate summaries of doctor-patient conversations",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="med_dialog_accuracy",
+            main_split="test",
+        )
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -109,3 +111,15 @@ class MedMCQAScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_mcqa",
+            display_name="MedMCQA",
+            description='MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to '
+            "address real-world medical entrance exam questions ([Flores et al. "
+            "2020](https://arxiv.org/abs/2203.14371)).",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -103,3 +105,15 @@ class MedQAScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="med_qa",
+            display_name="MedQA",
+            description="MedQA is an open domain question answering dataset composed of questions from "
+            "professional medical board exams ([Jin et al. "
+            "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
@@ -1,5 +1,6 @@
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.medalign_scenario_helper import return_dataset_dataframe  # type: ignore

@@ -92,3 +94,24 @@ class MedalignScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         dataset = return_dataset_dataframe(self.max_length, self.data_path)
         return self.process_tsv(dataset)
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medalign",
+            display_name="MedAlign",
+            short_display_name="MedAlign",
+            description="MedAlign is a benchmark that evaluates a model's ability to interpret and "
+            "follow instructions grounded in longitudinal electronic health records (EHR). "
+            "Each instance includes an event-stream style patient record and a natural "
+            "language question or task, requiring clinically informed reading comprehension "
+            "and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Answer questions and follow instructions over longitudinal EHR",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medalign_accuracy",
+            main_split="test",
+        )
@@ -3,6 +3,7 @@ import csv
 import sys
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded

@@ -143,3 +145,23 @@ class MedBulletsScenario(Scenario):
             csv_path = self.download_csv(output_path, split_suffix)
             instances.extend(self.process_csv(csv_path, split))
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medbullets",
+            display_name="Medbullets",
+            description="Medbullets is a benchmark of USMLE-style medical questions designed to assess "
+            "a model's ability to understand and apply clinical knowledge. Each question is "
+            "accompanied by a patient scenario and five multiple-choice options, similar to "
+            "those found on Step 2 and Step 3 board exams [(MedBullets, "
+            "2025)](https://step2.medbullets.com).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, . Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
@@ -1,6 +1,7 @@
 from typing import Dict, List
 from datasets import load_dataset

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )


@@ -125,3 +127,23 @@ class MedCalcBenchScenario(Scenario):
             instances.extend(self.process_csv(data, split))

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medcalc_bench",
+            display_name="MedCalc-Bench",
+            description="MedCalc-Bench is a benchmark designed to evaluate models on their ability to "
+            "compute clinically relevant values from patient notes. Each instance consists "
+            "of a clinical note describing the patient's condition, a diagnostic question "
+            "targeting a specific medical value, and a ground truth response. [(Khandekar "
+            "et al., 2024)](https://arxiv.org/abs/2406.12036).",
+            taxonomy=TaxonomyInfo(
+                task="Computational reasoning",
+                what="Compute a specific medical value from a patient note",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medcalc_bench_accuracy",
+            main_split="test",
+        )
@@ -1,6 +1,7 @@
 import csv
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded

@@ -123,3 +125,24 @@ class MedecScenario(Scenario):
         instances.extend(self.process_csv(test_csv, TEST_SPLIT))

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medec",
+            display_name="Medec",
+            description="Medec is a benchmark composed of clinical narratives that include either "
+            "correct documentation or medical errors. Each entry includes sentence-level "
+            "identifiers and an associated correction task. The model must review the "
+            "narrative and either identify the erroneous sentence and correct it, or "
+            "confirm that the text is entirely accurate [(Abacha et al., "
+            "2025)](https://arxiv.org/abs/2412.19260).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Detect and correct errors in medical narratives",
+                when="Any",
+                who="Researcher, Clinician",
+                language="English",
+            ),
+            main_metric="medec_error_flag_accuracy",
+            main_split="test",
+        )
@@ -1,6 +1,7 @@
 from typing import List
 from datasets import load_dataset

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
     Input,
+    ScenarioMetadata,
 )


@@ -70,3 +72,24 @@ Answer: {answer}
             )
             instances.append(hallucinated_instance)
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medhallu",
+            display_name="MedHallu",
+            description="MedHallu is a benchmark focused on evaluating factual correctness in "
+            "biomedical question answering. Each instance contains a PubMed-derived "
+            "knowledge snippet, a biomedical question, and a model-generated answer. The "
+            "task is to classify whether the answer is factually correct or contains "
+            "hallucinated (non-grounded) information. This benchmark is designed to assess "
+            "the factual reliability of medical language models.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Verify whether answers to questions from PubMed articles are " "factual or hallucinated",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
File without changes
@@ -0,0 +1,14 @@
+# The judges to be used for evaluating the note summary scenario.
+# name: The short name for the judge.
+# model: The field value matching the 'model_name' field under model_deployments.yaml
+# model_deployment: The field value matching the 'name' under model_deployments.yaml.
+judges:
+  - name: "gpt"
+    model: "openai/gpt-4o-2024-05-13"
+    model_deployment: "stanfordhealthcare/gpt-4o-2024-05-13"
+  - name: "llama"
+    model: "meta/llama-3.3-70b-instruct"
+    model_deployment: "stanfordhealthcare/llama-3.3-70b-instruct"
+  - name: "claude"
+    model: "anthropic/claude-3-7-sonnet-20250219"
+    model_deployment: "stanfordhealthcare/claude-3-7-sonnet-20250219"
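
The comments at the top of the new judges.yaml explain how each entry maps onto model_deployments.yaml. A minimal sketch of how such a file could be read, assuming PyYAML is available; the JudgeConfig class and load_judges helper are illustrative names, not necessarily the loader the package actually uses.

# Illustrative reader for a judges.yaml-style file; names here are hypothetical.
from dataclasses import dataclass
from typing import List

import yaml  # PyYAML


@dataclass(frozen=True)
class JudgeConfig:
    name: str              # short judge name, e.g. "gpt"
    model: str             # matches 'model_name' in model_deployments.yaml
    model_deployment: str  # matches 'name' in model_deployments.yaml


def load_judges(path: str) -> List[JudgeConfig]:
    with open(path) as f:
        config = yaml.safe_load(f)
    return [JudgeConfig(**entry) for entry in config["judges"]]

Called on the path to judges.yaml, this would yield three JudgeConfig entries (gpt, llama, claude), each pointing at a Stanford Health Care deployment of the corresponding model.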