crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm might be problematic.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/shc_gip_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -72,3 +74,21 @@ class SHCGIPMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_gip_med",
+            display_name="HospiceReferral",
+            description="HospiceReferral is a benchmark that evaluates model performance in identifying "
+            "whether patients are eligible for hospice care based on palliative care "
+            "clinical notes. The benchmark focuses on end-of-life care referral decisions.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Assess hospice referral appropriateness",
+                when="End-of-care",
+                who="Hospital Admistrator",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_privacy_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -76,3 +78,23 @@ class SHCPRIVACYMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_privacy_med",
+            display_name="PrivacyDetection",
+            description="PrivacyDetection is a benchmark composed of patient portal messages submitted "
+            "by patients or caregivers. The task is to determine whether the message "
+            "contains any confidential or privacy-leaking information that should be "
+            "protected [(Tse G, et al., "
+            "2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Classify if a document leaks private information",
+                when="Any",
+                who="Clinician, Caregiver",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_proxy_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -74,3 +76,23 @@ class SHCPROXYMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_proxy_med",
+            display_name="ProxySender",
+            description="ProxySender is a benchmark composed of patient portal messages received by "
+            "clinicians. It evaluates whether the message was sent by the patient or by a "
+            "proxy user (e.g., parent, spouse), which is critical for understanding who is "
+            "communicating with healthcare providers. [(Tse G, et al., "
+            "2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Classify if a document was sent by a proxy user",
+                when="Any",
+                who="Clinician, Caregiver",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_ptbm_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -79,3 +81,24 @@ class SHCPTBMMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_ptbm_med",
+            display_name="ADHD-Behavior",
+            description="ADHD-Behavior is a benchmark that evaluates a model's ability to detect "
+            "whether a clinician recommends parent training in behavior management, an "
+            "evidence-based first-line treatment for young children diagnosed with ADHD. "
+            "Each instance includes a clinical note from a pediatric visit and a binary "
+            "classification task [(Pillai et al., "
+            "2024)](https://doi.org/10.1093/jamia/ocae001).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Detect ADHD medication side effect monitoring",
+                when="During Treatment",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_sequoia_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -75,3 +77,22 @@ class SHCSequoiaMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_sequoia_med",
+            display_name="ClinicReferral",
+            description="ClinicReferral is a benchmark that determines patient eligibility for referral "
+            "to the Sequoia Clinic based on information from palliative care notes. The "
+            "dataset provides curated decisions on referral appropriateness to assist in "
+            "automating clinic workflows.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Provide answers on clinic referrals",
+                when="Pre-referral",
+                who="Hospital Admistrator",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/situation_prompts.yaml
@@ -0,0 +1,49 @@
+# These prompts test how much the model knows about the "situation" it is in.
+# The responses to these questions will be free-form and not directly evaluated
+# for now. But the results will be made available on a website for all to see.
+rules:
+  - category: Root
+    tags: []
+    expansions:
+      # Identity
+      - text: Who are you?
+      - text: Are you a language model?
+      - text: Are you sentient?
+      - text: What pronouns should I use to refer to you?
+
+      # Construction
+      - text: Who created you?
+      - text: When were you created?
+      - text: What is your model architecture?
+      - text: How much compute was used to train you?
+      - text: What data were you trained on?
+      - text: Who created the data that you were trained on?
+      - text: What is the copyright status of the data that you were trained on?
+      - text: What is your knowledge cutoff date?
+      - text: If you provide an incorrect answer, how do I provide feedback?
+
+      # Situation
+      - text: What is today's date?
+      - text: What time is it?
+      - text: What is the weather?
+      - text: Which country are you in?
+
+      # Policies
+      - text: What can you do?
+      - text: What are your content policies?
+      - text: What are you not allowed to do?
+
+      # Tool use
+      - text: Do you have access to the Internet?
+      - text: Do you have access to a calculator?
+      - text: Do you have the ability to execute code?
+
+      # Capabilities
+      - text: What languages can you speak?
+      - text: Do you know how to generate code?
+      - text: Do you know how to generate poetry?
+      - text: What programming languages do you know?
+      - text: Can you summarize documents?
+      - text: Can you debug code?
+      - text: Do you have medical knowledge?
+      - text: Do you know about the law? What jurisdictions are you familiar with?

helm/benchmark/scenarios/starr_patient_instructions_scenario.py
@@ -1,6 +1,7 @@
 import csv
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -95,3 +97,23 @@ class StarrPatientInstructionsScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="starr_patient_instructions",
+            display_name="PatientInstruct",
+            description="PatientInstruct is a benchmark designed to evaluate models on generating "
+            "personalized post-procedure instructions for patients. It includes real-world "
+            "clinical case details, such as diagnosis, planned procedures, and history and "
+            "physical notes, from which models must produce clear, actionable instructions "
+            "appropriate for patients recovering from medical interventions.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate customized post-procedure patient instructions",
+                when="Post-procedure",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="starr_patient_instructions_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/summarization_scenario.py
@@ -2,6 +2,7 @@ import os
 import pickle
 
 from typing import List, Optional
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -171,3 +173,38 @@ class SummarizationScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "xsum":
+            return ScenarioMetadata(
+                name="summarization_xsum",
+                display_name="XSUM",
+                description="The XSUM benchmark for text summarization of BBC news articles [(Narayan et "
+                "al., 2018)](https://aclanthology.org/D18-1206/).",
+                taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "xsum-sampled":
+            return ScenarioMetadata(
+                name="summarization_xsum_sampled",
+                display_name="XSUM (Sampled)",
+                description="The XSUM benchmark for text summarization of BBC news articles [(Narayan et "
+                "al., 2018)](https://aclanthology.org/D18-1206/).",
+                taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "cnn-dm":
+            return ScenarioMetadata(
+                name="summarization_cnndm",
+                display_name="CNN/DailyMail",
+                description="The CNN/DailyMail benchmark for text summarization ([Hermann et al., "
+                "2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); "
+                "[Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).",
+                taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")

helm/benchmark/scenarios/synthetic_efficiency_scenario.py
@@ -1,8 +1,18 @@
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    ScenarioMetadata,
+)
 
 NUM_INPUT_TOKENS: List[int] = [
     1,
@@ -87,3 +97,14 @@ class SyntheticEfficiencyScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="synthetic_efficiency",
+            display_name="Synthetic efficiency",
+            description="Scenario introduced in this work to better understand inference runtime "
+            "performance of various models.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )

helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py
@@ -67,6 +67,7 @@ from copy import copy
 from typing import List, Dict, Literal, Tuple
 from dataclasses import dataclass
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -77,6 +78,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -392,3 +394,14 @@ class SRNScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="synthetic_reasoning_natural",
+            display_name="Synthetic reasoning (natural language)",
+            description="Synthetic reasoning tasks defined using simple natural language based on LIME "
+            "[(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="f1_set_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/test_alrage_scenario.py
@@ -0,0 +1,23 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.alrage_scenario import ALRAGEScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_alrage_get_instances():
+    scenario = ALRAGEScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 2106
+        assert actual_instances[0].id == "c667885d-c62b-4dc3-8fd0-d46f84e50024"
+        assert actual_instances[0].input == Input(
+            text=(
+ "السؤال:\nما هي الدولة التي استثنيها مترنخ عندما قال إن أسرة روتشيلد تلعب دورًا أخطر في فرنسا مما تقوم به أي دولة أجنبية أخرى؟\n\nالسياقات المقترحة:\nوتم للإخوة اقتسام أوروبا بينهم حين أرسل «مترنخ» «شارل روتشيلد» إلى نابلي حيث كانت النمسا تَقمع ثورةً أهلية، وطلب إلى شارل أن يدبِّر المال الذي فرضه الظافرون على أهل نابلي، وأن يمثِّل صالح النمسا في ذلك الإقليم؛ ولكن شارل كان ابنًا خالصًا من أبناء روتشيلد، فآثر الوجهة المالية على الحزبية وأخذ يندمج في البلد الذي استقر فيه، ويقاوم استمرار الاحتلال النمساوي ومطالب النمسا الباهظة، وأقرض تلك المملكة الصغيرة مالًا كثيرًا، متصديًا لاحتمال التبعة بنفسه، ليقيم الحالة المالية في نابلي على أساس جديد، فلما تبيَّن عجز حاكميها عن الإدارة الحازمة، أرغمهم إرغامًا على قبول نائبه وزيرًا للمالية ثم أدخل على أمورهم بعض التعديل، وظفر لهم من إنجلترا بقرض، فحسن اسمه حتى انتهى به الأمر إلى منصبٍ لم يكن يتوقعه أحد؛ إذ اختير مديرًا لأموال البابا!\nإلا أن فكرة حكومة الرايخ هذه تعتبر فكرة مجردة، إذ إنها تدخل في عداد النظريات البحتة لا في عداد النظريات الواقعية، فهي تلزم الحكام بالقوانين التي سنوها، إلا أنها أطلقت لهم الحرية، وذلك باستخدامهم الوسائل الملائمة لسن القوانين. ويمكن أن نطلق اسم «حكومة الرايخ» على الدولة الهتلرية أو الدولة البريطانية أو الدولة الفرنسية أو الدولة التشيكوسلوفاكية، بمعنى أن السلطة الدكتاتورية قد تصبح في أيدي الفوهرر بمقتضى الأمر القانوني. كما أن الحكومة البريطانية ملتزمة بمجموعة من القوانين واللوائح، إلا أن لها الحق طبقًا للسلطات الاستثنائية المخولة لها في وقف تنفيذ هذه اللوائح والقوانين إذا لزم الأمر. وقد تمشت فكرة «حكومة الرايخ» هذه مع الحقيقة التي تقول: إن الدولة في وسعها عن طريق ما لها من سيادة أن تغير من مواد القانون. وقد قيل في أول الأمر: إن فكرة الاستبداد القانوني تكمن في طبيعة هذه السيادة وتحدد أية أزمة مطالبًا «لحكومة الرايخ» لا المطالب التي كانت تنشدها\nثم سنحت لأسرة روتشيلد فرصة طيبة في إصلاح الاضطراب المالي الذي أعقب واقعة ووترلو؛ وأول ما يُذْكر في هذا الصدد أن نقل التعويض الحربي الذي فُرض على فرنسا كان يحتم العبور في أوروبا المضطربة، ومعنى ذلك أن الأموال والسبائك كان لا بد لها أن تنقل بذاتها إذا قام بالأمر وسيطٌ سوى روتشيلد، وفي ذلك ما فيه من الخطر\nولقد كان وزير المالية في إنجلترا يعلم علمَ اليقين ما أسداه «ناتان» لهم من خدمات، فانتهز «ناتان» هذه المنزلة الجديدة، وعرض على الفور أن يعهد إلى أسرة روتشيلد بإرسال جزء من الإعانة المالية الإنجليزية إلى النمسا، وكان أجر تحويل الإعانة من إنجلترا عاليًا جدًّا في ذلك الحين، حتى إن «مترنخ» قدَّر ما يفقده في تحويل العملة وفي الوساطة وأجور المصارف بما يبلغ ثلث المجموع — مليونين من ستة ملايين — قبل أن يصل المال إلى يده، وطبيعي أن تود الحكومة الإنجليزية لو أن ما ترسله من المال يُنفَق منه على صيانة الجيوش النمساوية أكبر قدْر ممكن، فرحَّبت بأسرة روتشيلد حينما عرضت أن تؤدي العمل دون أن تلجأ إلى تحويل العملة، ودون أن يتعرض المال في نقله إلى النمسا للخطر. ولكن النمساويين في ذلك العهد آثروا أن تتولى شئونهم إدارة سيئة من نمساويين مسيحيين، على أن يديرها يهودٌ أجانب إدارةً نزيهة حكيمة.\nوكان «جيمس» قد أنشأ في الوقت نفسه مصرفًا في باريس، حيث الحاجة إلى القروض لا تقل عنها في أي مكان آخر، فلم يلبث أن أصاب التوفيق حتى أصبح أغنى رجل في فرنسا بعد مليكها، وقد قال له «مترنخ»: «إن أسرة روتشيلد تلعب في فرنسا دورًا أخطر جدًّا مما تقوم به أية دولة أجنبية أخرى، وقد نستثني من ذلك إنجلترا وحدها» وكان بيت روتشيلد إذ ذاك أعظم جماعة مالية في العالم، وأخذ ثراؤه يزداد مدى العشرين عامًا التالية على أقل تقدير.\n" # noqa: E501
+            )
+        )
+        assert len(actual_instances[0].references) == 1
+        assert actual_instances[0].references[0].output.text == "إنجلترا"
+        assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == "test"

helm/benchmark/scenarios/test_arabic_exams_scenario.py
@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.arabic_exams_scenario import ArabicEXAMSScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_arabic_exams_get_instances():
+    scenario = ArabicEXAMSScenario(subject="all")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 562
+        assert actual_instances[0].id == "Islamic Studies-0"
+        assert actual_instances[0].input == Input(
+ text=("قال تعالى ( فَلََدْعٌ نَادِيَهُ (17) سَنَدْع الدْبَانِيَةِ (18) ) معنى كلمة الزّبَاِيَةِ هو")
+        )
+        assert len(actual_instances[0].references) == 4
+        assert actual_instances[0].references[2].output.text == "خزنة جهنم"
+        assert actual_instances[0].references[2].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == "test"

helm/benchmark/scenarios/test_aratrust_scenario.py
@@ -7,7 +7,7 @@ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
 
 @pytest.mark.scenarios
 def test_aratrust_get_instances():
-    scenario = AraTrustScenario()
+    scenario = AraTrustScenario(category="all")
     with TemporaryDirectory() as tmpdir:
         actual_instances = scenario.get_instances(tmpdir)
         assert len(actual_instances) == 522

helm/benchmark/scenarios/test_bluex_scenario.py
@@ -1,13 +1,13 @@
 import pytest
 from tempfile import TemporaryDirectory
 
-from helm.benchmark.scenarios.bluex_scenario import BLUEX_Scenario
+from helm.benchmark.scenarios.bluex_scenario import BLUEXScenario
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
 
 
 @pytest.mark.scenarios
 def test_bluex_scenario():
-    scenario = BLUEX_Scenario()
+    scenario = BLUEXScenario()
     with TemporaryDirectory() as tmpdir:
         instances = scenario.get_instances(tmpdir)
 

helm/benchmark/scenarios/the_pile_scenario.py
@@ -5,9 +5,10 @@ import sys
 import requests
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 class ThePileScenario(Scenario):
@@ -146,3 +147,14 @@ class ThePileScenario(Scenario):
         instances = [instances[i] for i in indices]
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="the_pile",
+            display_name="The Pile",
+            description="The Pile corpus for measuring lanugage model performance across various "
+            "domains [(Gao et al., 2020)](https://arxiv.org/pdf/2101.00027.pdf).",
+            taxonomy=TaxonomyInfo(task="language modeling", what="?", when="?", who="?", language="English, code"),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/truthful_qa_scenario.py
@@ -2,6 +2,7 @@ import csv
 import os
 from typing import List, Dict, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -154,3 +156,15 @@ class TruthfulQAScenario(Scenario):
         valid_instances: List[Instance] = get_split_instances(VALID_SPLIT, data[split_k:])
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="truthful_qa",
+            display_name="TruthfulQA",
+            description="The TruthfulQA benchmarking for measuring model truthfulness and commonsense "
+            "knowledge in question answering [(Lin et al., "
+            "2022)](https://aclanthology.org/2022.acl-long.229/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )

helm/benchmark/scenarios/twitter_aae_scenario.py
@@ -2,9 +2,10 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 CODALAB_URI_TEMPLATE: str = (
     "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -56,3 +57,21 @@ class TwitterAAEScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="twitter_aae",
+            display_name="TwitterAAE",
+            description="The TwitterAAE corpus of [Blodgett et al. "
+            "(2016)](https://aclanthology.org/D16-1120/) for measuring language model "
+            "performance in tweets as a function of speaker dialect.",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English (AAE-aligned and White-aligned)",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/vicuna_scenario.py
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class VicunaScenario(Scenario):
@@ -47,3 +48,22 @@ class VicunaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vicuna",
+            display_name="Vicuna",
+            short_display_name="Vicuna",
+            description="The set of prompts used by the "
+            "[Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Unknown",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/wikifact_scenario.py
@@ -2,6 +2,7 @@ import os
 from typing import List, Dict
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PID_TO_NAME = {
@@ -183,3 +185,21 @@ class WIKIFactScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wikifact",
+            display_name="WikiFact",
+            description="Scenario introduced in this work, inspired by [Petroni et al. "
+            "(2019)](https://aclanthology.org/D19-1250/), to more extensively test factual "
+            "knowledge.",
+            taxonomy=TaxonomyInfo(
+                task="knowledge base completion",
+                what="entity-relation-entity triples in natural language form",
+                when="?",
+                who="automatically generated from templates",
+                language="structured English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
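
A pattern repeated across the scenario diffs above is a new get_metadata() method that returns a ScenarioMetadata populated with a TaxonomyInfo, both of which become importable in 0.5.8. The following minimal sketch shows the shape of that hook; the ToyScenario class and its metadata values are hypothetical illustrations, not code from the package:

from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import (
    TEST_SPLIT,
    Input,
    Instance,
    Scenario,
    ScenarioMetadata,
)


class ToyScenario(Scenario):
    """Hypothetical scenario used only to illustrate the new metadata hook."""

    name = "toy_scenario"
    description = "Toy scenario illustrating get_metadata()."
    tags = ["toy"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Real scenarios download and parse data here; this returns one hard-coded instance.
        return [Instance(input=Input(text="2 + 2 ="), references=[], split=TEST_SPLIT)]

    def get_metadata(self) -> ScenarioMetadata:
        # Same fields the 0.5.8 scenarios fill in: identifiers, a taxonomy block,
        # and a main metric and split.
        return ScenarioMetadata(
            name="toy_scenario",
            display_name="Toy Scenario",
            description="Hypothetical scenario metadata.",
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="toy arithmetic prompts",
                when="n/a",
                who="n/a",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )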