crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/omni_math_scenario.py

@@ -1,6 +1,7 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     Input,
     Output,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -51,3 +53,19 @@ class OmniMATHScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="Omni-MATH",
+            description=self.description,
+            main_metric="omni_math_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="mathematics",
+                what="universal Olympiad level mathematic benchmark",
+                who="human annotators",
+                when="2024",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/open_assistant_scenario.py

@@ -2,6 +2,7 @@ from typing import List, Dict, Any, DefaultDict
 from datasets import load_dataset, Dataset
 from collections import defaultdict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     VALID_SPLIT,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -126,3 +128,23 @@ class OpenAssistantScenario(Scenario):
         valid_instances = get_split_instances(dataset["validation"], VALID_SPLIT)
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="open_assistant",
+            display_name="Open Assistant",
+            short_display_name="Open Assistant",
+            description="LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 "
+            "conversation trees ([Köpf et al., "
+            "2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial "
+            "prompt in each conversation.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Human-written dialogues and response rankings",
+                when="2023",
+                who="Open Assistant participants",
+                language="35 languages",
+            ),
+            main_metric="Helpfulness",
+            main_split="valid",
+        )
helm/benchmark/scenarios/pubmed_qa_scenario.py

@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -186,3 +188,23 @@ class PubMedQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="pubmed_qa",
+            display_name="PubMedQA",
+            description="PubMedQA is a biomedical question-answering dataset that evaluates a model's "
+            "ability to interpret scientific literature. It consists of PubMed abstracts "
+            "paired with yes/no/maybe questions derived from the content. The benchmark "
+            "assesses a model's capability to reason over biomedical texts and provide "
+            "factually grounded answers.",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer questions based on PubMed abstracts",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/quac_scenario.py

@@ -3,6 +3,7 @@ import os
 import random
 from typing import List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -192,3 +194,15 @@ class QuACScenario(Scenario):
             instances.extend(self.get_split_instances(split_path, split=split_tag))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="quac",
+            display_name="QuAC (Question Answering in Context)",
+            short_display_name="QuAC",
+            description="The QuAC benchmark for question answering in the context of dialogues [(Choi "
+            "et al., 2018)](https://aclanthology.org/D18-1241/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="f1_score",
+            main_split="valid",
+        )
helm/benchmark/scenarios/race_based_med_scenario.py

@@ -4,6 +4,7 @@ import os
 from typing import Dict, List
 from docx import Document
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -150,3 +152,24 @@ class RaceBasedMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="race_based_med",
+            display_name="RaceBias",
+            description="RaceBias is a benchmark used to evaluate language models for racially biased "
+            "or inappropriate content in medical question-answering scenarios. Each "
+            "instance consists of a medical question and a model-generated response. The "
+            "task is to classify whether the response contains race-based, harmful, or "
+            "inaccurate content. This benchmark supports research into bias detection and "
+            "fairness in clinical AI systems.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Identify race-based bias in LLM-generated medical responses",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/raft_scenario.py

@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PROMPT_SETTINGS_URL = "https://www.dropbox.com/s/a5cyevryzw8rt4f/prompt_construction_settings.json?dl=0"
@@ -144,3 +146,16 @@ class RAFTScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="raft",
+            display_name="RAFT (Real-world Annotated Few-Shot)",
+            short_display_name="RAFT",
+            description="The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text "
+            "classification tasks [(Alex et al., "
+            "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py

@@ -3,8 +3,9 @@ import os
 import random
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 TOXIC_SUB_SPLIT: str = "toxic"
 NONTOXIC_SUB_SPLIT: str = "non-toxic"
@@ -57,3 +58,15 @@ class RealToxicityPromptsScenario(Scenario):
         random.shuffle(instances)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="real_toxicity_prompts",
+            display_name="RealToxicityPrompts",
+            description="The RealToxicityPrompts dataset for measuring toxicity in prompted model "
+            "generations [(Gehman et al., "
+            "2020)](https://aclanthology.org/2020.findings-emnlp.301/).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
helm/benchmark/scenarios/scenario.py

@@ -5,6 +5,7 @@ import os
 from pathlib import PurePath
 import inspect
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.media_object import MultimediaObject
 from helm.common.object_spec import ObjectSpec, create_object
 from helm.common.general import ensure_directory_exists, format_text, format_split, format_tags, indent_lines
@@ -189,6 +190,33 @@ class Instance:
         return info
 
 
+@dataclass(frozen=True)
+class ScenarioMetadata:
+    name: str
+    """Internal name (usually no spaces, etc.)"""
+
+    main_metric: str
+
+    main_split: str
+
+    display_name: Optional[str] = None
+    """What is displayed to the user"""
+
+    short_display_name: Optional[str] = None
+    """What is displayed to the user (e.g., in a table header)"""
+
+    description: Optional[str] = None
+    """Description of the scenario"""
+
+    short_description: Optional[str] = None
+    """Optional short description of the scenario.
+    This description is used in some space-constrained places in frontend tables.
+    If unset, the description field will be used instead."""
+
+    taxonomy: Optional[TaxonomyInfo] = None
+    """Optional taxonomy"""
+
+
 # TODO(#1212): Scenario should not be a dataclass.
 @dataclass
 class Scenario(ABC):
@@ -249,6 +277,9 @@ class Scenario(ABC):
         output.append("}")
         return output
 
+    def get_metadata(self) -> ScenarioMetadata:
+        raise NotImplementedError()
+
 
 def with_instance_ids(instances: List[Instance]) -> List[Instance]:
     """Return the instances with an ID. Note: order of instances matters."""