crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,9 @@ import os
  import random
  from typing import List, Dict

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
- from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata


  DATA_REPO_HASH = "38972f6ccbf376a8d0660babafb4d2b3b9cca3f4"
@@ -76,3 +77,14 @@ class DecodingTrustToxicityPromptsScenario(Scenario):
  random.shuffle(instances)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="decodingtrust_toxicity_prompts",
+ display_name="DecodingTrust - Toxicity",
+ short_display_name="Toxicity",
+ description="Evaluation of the privacy understanding and privacy preserving properties of " "LLMs",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="unknown",
+ main_split="test",
+ )
@@ -1,4 +1,5 @@
  from typing import List
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import check_file_exists
  from helm.benchmark.scenarios.scenario import (
  Input,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Reference,
  Output,
+ ScenarioMetadata,
  )
  import pandas as pd

@@ -170,3 +172,25 @@ class DischargeMeScenario(Scenario):
  lines = file.readlines()
  lines = [line.strip() for line in lines]
  return lines
+
+ def get_metadata(self):
+ return ScenarioMetadata(
+ name="dischargeme",
+ display_name="DischargeMe",
+ short_display_name="DischargeMe",
+ description="DischargeMe is a benchmark designed to evaluate clinical text generation. It "
+ "pairs discharge summaries and radiology reports from MIMIC-IV with generation "
+ "tasks such as writing discharge instructions or summarizing the brief hospital "
+ "course. The benchmark assesses a model's ability to generate patient-facing "
+ "documentation that is complete, empathetic, and clinically accurate [(Xu, "
+ "2024)](https://physionet.org/content/discharge-me/1.3/).",
+ taxonomy=TaxonomyInfo(
+ task="Text generation",
+ what="Generate discharge instructions from hospital notes",
+ when="Upon hospital discharge",
+ who="Clinician",
+ language="English",
+ ),
+ main_metric="dischargeme_accuracy",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import json
  import os
  from typing import List, Dict, Optional

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
  Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )

  REITERATION_DATA_URL = "https://drive.google.com/uc?export=download&id=1uVJbsgPCHFAvH43I6SVvU3Ayo8dh-y_N"
@@ -175,3 +177,23 @@ class DisinformationScenario(Scenario):
  instances = self.create_wedging_instances(data)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ if self.capability == "reiteration":
+ name = "disinformation_reiteration"
+ display_name = "Disinformation (reiteration)"
+ elif self.capability == "wedging":
+ name = "disinformation_wedging"
+ display_name = "Disinformation (wedging)"
+ else:
+ raise Exception(f"Unknown capability {self.capability}")
+ return ScenarioMetadata(
+ name=name,
+ display_name=display_name,
+ description="Scenario from [Buchanan et al. "
+ "(2021)](https://cset.georgetown.edu/publication/truth-lies-and-automation/) "
+ "that tests the ability to generate divisive and wedging content.",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="unknown",
+ main_split="valid",
+ )
@@ -2,6 +2,7 @@ import numpy as np
  import random
  from typing import List, Tuple

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  Scenario,
  Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -234,3 +236,16 @@ class DyckLanguageScenario(Scenario):
  not_allowed=train_inputs,
  )
  return train_instances + test_instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="dyck_language",
+ display_name="Dyck",
+ description="Scenario testing hierarchical reasoning through the Dyck formal languages "
+ "[(Suzgun et al., 2019)](https://aclanthology.org/W19-3905/).",
+ taxonomy=TaxonomyInfo(
+ task="next-word prediction", what="Dyck formal language", when="n/a", who="n/a", language="synthetic"
+ ),
+ main_metric="exact_match_indicator",
+ main_split="test",
+ )
@@ -7,6 +7,7 @@ from functools import partial
  from tqdm import tqdm
  from typing import Any, Dict, List, Optional, Mapping

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import check_file_exists, ensure_directory_exists
  from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Reference,
  Output,
+ ScenarioMetadata,
  )

  ##################################
@@ -1517,3 +1519,23 @@ class EHRSHOTScenario(Scenario):
  )

  return instances
+
+ def get_metadata(self):
+ return ScenarioMetadata(
+ name="ehrshot",
+ display_name="EHRSHOT",
+ description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+ "future clinical events using structured EHR code sequences. Each instance "
+ "contains a patient's historical EHR data and a forward-looking clinical "
+ "question about whether a particular diagnosis, lab result, or hospital event "
+ "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+ taxonomy=TaxonomyInfo(
+ task="Classification",
+ what="Predict whether a medical event will occur in the future based " "on EHR codes",
+ when="Future prediction",
+ who="Clinician, Insurer",
+ language="English",
+ ),
+ main_metric="exact_match",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ from typing import List, Any
  from pathlib import Path
  from datasets import load_dataset

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  Scenario,
  Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -56,3 +58,20 @@ class ENEMChallengeScenario(Scenario):
  )
  instances.append(instance)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="enem_challenge",
+ display_name="ENEM Challenge",
+ short_display_name=None,
+ description="ENEM Challenge",
+ taxonomy=TaxonomyInfo(
+ task="multiple-choice question answering",
+ what="general academic subjects",
+ when="between 2009 and 2023",
+ who="brazilian ministry of education",
+ language="Portuguese",
+ ),
+ main_metric="exact_match",
+ main_split="test",
+ )
@@ -3,6 +3,7 @@ import pandas as pd
  from pathlib import Path
  from typing import List, Tuple

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.hierarchical_logger import hlog
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -160,3 +162,15 @@ class EntityDataImputationScenario(Scenario):
  instances.append(instance)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="entity_data_imputation",
+ display_name="Data imputation",
+ description="Scenario from [Mei et al. "
+ "(2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability "
+ "to impute missing entities in a data table.",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="quasi_exact_match",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import pandas as pd
  from pathlib import Path
  from typing import Dict, List, Tuple

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.hierarchical_logger import hlog
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )
  from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset

@@ -155,3 +157,15 @@ class EntityMatchingScenario(Scenario):
  instances.append(instance)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="entity_matching",
+ display_name="Entity matching",
+ description="Scenario from Magellan [(Konda et al., "
+ "2016)](https://dl.acm.org/doi/10.14778/3007263.3007314) that tests the ability "
+ "to determine if two entities match.",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="quasi_exact_match",
+ main_split="test",
+ )
@@ -0,0 +1,115 @@
+ import os
+ from typing import Dict, List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+ VALID_SPLIT,
+ Scenario,
+ Instance,
+ Reference,
+ TEST_SPLIT,
+ TRAIN_SPLIT,
+ CORRECT_TAG,
+ Input,
+ Output,
+ )
+ from helm.common.hierarchical_logger import hwarn
+
+
+ class EXAMSMultilingualScenario(Scenario):
+ """EXAMS: A Multi-subject High School Examinations Dataset
+
+ EXAMS is a benchmark dataset for multilingual and cross-lingual
+ question answering from high school examinations. It consists of
+ more than 24,000 high-quality high school exam questions in 16
+ languages, covering 8 language families and 24 school subjects
+ from Natural Sciences and Social Sciences, among others.
+
+ - https://huggingface.co/datasets/mhardalov/exams
+ - https://aclanthology.org/2020.emnlp-main.438/
+
+ Note: Some dataset rows have the value '@' in the `answerKey` column.
+ These rows will be ignored.
+
+ ```
+ @inproceedings{hardalov-etal-2020-exams,
+ title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+ author = "Hardalov, Momchil and
+ Mihaylov, Todor and
+ Zlatkova, Dimitrina and
+ Dinkov, Yoan and
+ Koychev, Ivan and
+ Nakov, Preslav",
+ editor = "Webber, Bonnie and
+ Cohn, Trevor and
+ He, Yulan and
+ Liu, Yang",
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+ month = nov,
+ year = "2020",
+ address = "Online",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2020.emnlp-main.438/",
+ doi = "10.18653/v1/2020.emnlp-main.438",
+ pages = "5427--5444",
+ abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+ }```
+ """ # noqa: E501
+
+ name = "exams_multilingual"
+ description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. " # noqa: E501
+ tags = ["knowledge", "multiple_choice"]
+
+ CHOICES = ["A", "B", "C", "D", "E"]
+ HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+ def __init__(self, language: str, subject: str):
+ super().__init__()
+ self.language = language
+ self.subject = subject
+
+ def get_instances(self, output_path: str) -> List[Instance]:
+ cache_dir = os.path.join(output_path, "data")
+ ensure_directory_exists(cache_dir)
+ dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+ "mhardalov/exams",
+ "multilingual",
+ revision="4ff10804abb3341f8815cacd778181177bba7edd",
+ cache_dir=cache_dir,
+ )
+
+ # Read all instances
+ instances: List[Instance] = []
+ for split_name, dataset in dataset_splits.items():
+ assert isinstance(dataset, datasets.Dataset)
+ for row in dataset:
+ question = row["question"]
+ question_info = row["info"]
+ if self.subject != "all" and question_info["subject"] != self.subject:
+ continue
+ if self.language != "all" and question_info["language"] != self.language:
+ continue
+ input = Input(text=question["stem"])
+ references: List[Reference] = []
+ if row["answerKey"] not in self.CHOICES:
+ hwarn(f"Invalid value in answerKey column in row: {row}")
+ continue
+ correct_choice_index = ord(row["answerKey"]) - ord("A")
+ for choice_index, choice_text in enumerate(question["choices"]["text"]):
+ references.append(
+ Reference(
+ output=Output(text=choice_text),
+ tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+ )
+ )
+ instance = Instance(
+ id=row["id"],
+ input=input,
+ references=references,
+ split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+ )
+ instances.append(instance)
+
+ return instances
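For orientation, here is a minimal usage sketch of the new `EXAMSMultilingualScenario` added above (the file list shows it as `helm/benchmark/scenarios/exams_multilingual_scenario.py`). The output directory path is a placeholder; passing `"all"` for `language` and `subject` simply disables the filtering shown in the diff, which compares those values against `row["info"]`.

```python
# Hypothetical sketch, not part of the package diff.
# Assumes the constructor and get_instances() signature shown in the hunk above.
from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario

scenario = EXAMSMultilingualScenario(language="all", subject="all")
# "benchmark_output/scenarios/exams_multilingual" is an illustrative path.
instances = scenario.get_instances(output_path="benchmark_output/scenarios/exams_multilingual")
print(f"Loaded {len(instances)} instances")
if instances:
    print(instances[0].input.text)  # the question stem, per Input(text=question["stem"])
```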
@@ -2,6 +2,7 @@ import os
  import random
  from typing import List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
  Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -92,3 +94,22 @@ Possible labels:\n1. positive\n2. neutral\n3. negative""" # noqa: E501
  )
  instances.append(instance)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="financial_phrasebank",
+ display_name="Financial Phrasebank (Sentiment Classification)",
+ short_display_name=None,
+ description="A sentiment classification benchmark based on the dataset from Good Debt or "
+ "Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., "
+ "2013)](https://arxiv.org/abs/1307.5336).",
+ taxonomy=TaxonomyInfo(
+ task="sentiment analysis",
+ what="phrases from financial news texts and company press releases",
+ when="before 2013",
+ who="annotators with adequate business education background",
+ language="English",
+ ),
+ main_metric="classification_weighted_f1",
+ main_split="test",
+ )
@@ -6,6 +6,7 @@ from typing import List

  import pandas as pd

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.runner import TRAIN_SPLIT
  from helm.common.general import ensure_directory_exists, ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
  Reference,
  Scenario,
  Output,
+ ScenarioMetadata,
  )


@@ -122,3 +124,22 @@ class GoldCommodityNewsScenario(Scenario):
  for train_index in train_indexes:
  instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="gold_commodity_news",
+ display_name="Gold Commodity News",
+ short_display_name=None,
+ description="A classification benchmark based on a dataset of human-annotated gold "
+ "commodity news headlines ([Sinha & Khandait, "
+ "2019](https://arxiv.org/abs/2009.04202)).",
+ taxonomy=TaxonomyInfo(
+ task="text classification",
+ what="gold commodity news headlines",
+ when="2000-2019",
+ who="financial journalists",
+ language="English",
+ ),
+ main_metric="classification_weighted_f1",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import datasets
  import os
  import random
  from typing import List
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  Scenario,
  Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  Input,
  Output,
+ ScenarioMetadata,
  )
  from helm.common.general import ensure_directory_exists

@@ -78,3 +80,19 @@ class GPQAScenario(Scenario):
  instances.append(instance)

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name=self.name,
+ display_name="GPQA",
+ description=self.description,
+ main_metric="chain_of_thought_correctness",
+ main_split="test",
+ taxonomy=TaxonomyInfo(
+ task="question answering",
+ what="complex questions across various disciplines",
+ who="domain experts",
+ when="2024",
+ language="English",
+ ),
+ )
@@ -1,6 +1,7 @@
  from typing import List

- from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
  from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags


@@ -41,3 +42,21 @@ class GrammarScenario(Scenario):
  instances: List[Instance] = list(map(derivation_to_instance, derivations))

  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="grammar",
+ display_name="Best ChatGPT Prompts",
+ short_display_name="Best ChatGPT Prompts",
+ description="A list of “best ChatGPT prompts to power your workflow” summarized by "
+ "[GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).",
+ taxonomy=TaxonomyInfo(
+ task="open-ended instruction following",
+ what="Instructions for LLMs",
+ when="2023",
+ who="Gridfiti Staff",
+ language="English",
+ ),
+ main_metric="Helpfulness",
+ main_split="test",
+ )
@@ -2,6 +2,7 @@ import json
  import os
  from typing import Dict, List

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.benchmark.scenarios.scenario import (
  Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
  TEST_SPLIT,
  Input,
  Output,
+ ScenarioMetadata,
  )


@@ -65,3 +67,16 @@ class GSM8KScenario(Scenario):
  ),
  )
  return instances
+
+ def get_metadata(self) -> ScenarioMetadata:
+ return ScenarioMetadata(
+ name="gsm",
+ display_name="GSM8K (Grade school math word problems)",
+ short_display_name="GSM8K",
+ description="The grade school math word problems dataset (GSM8K) for testing mathematical "
+ "reasoning on grade-school math problems [(Cobbe et al., "
+ "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+ main_metric="exact_match_indicator",
+ main_split="test",
+ )
@@ -3,6 +3,7 @@ from typing import List, Optional

  from datasets import DatasetDict, load_dataset

+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.benchmark.scenarios.scenario import (
  CORRECT_TAG,
  TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
  Output,
  Reference,
  Scenario,
+ ScenarioMetadata,
  )
  from helm.common.general import ensure_directory_exists

@@ -134,3 +136,23 @@ class HeadQAScenario(Scenario):
  )

  return instances
+
+ def get_metadata(self):
+ return ScenarioMetadata(
+ name="head_qa",
+ display_name="HeadQA",
+ description="HeadQA is a benchmark consisting of biomedical multiple-choice questions "
+ "intended to evaluate a model's medical knowledge and reasoning. Each instance "
+ "presents a clinical or scientific question with four answer options, requiring "
+ "the model to select the most appropriate answer [(Vilares et al., "
+ "2019)](https://arxiv.org/abs/1906.04701).",
+ taxonomy=TaxonomyInfo(
+ task="Question answering",
+ what="Medical knowledge testing",
+ when="Any",
+ who="Medical student, Researcher",
+ language="English",
+ ),
+ main_metric="exact_match",
+ main_split="test",
+ )
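The hunks above all follow the same pattern: each scenario gains a `get_metadata()` method returning a `ScenarioMetadata` whose `taxonomy` field is a `TaxonomyInfo`. Below is a hedged sketch of how a custom scenario might adopt the same pattern; `MyCustomScenario` and its field values are illustrative, and only the keyword arguments mirror what the hunks show.

```python
# Illustrative sketch only; class name and field values are hypothetical.
from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class MyCustomScenario(Scenario):
    name = "my_custom_scenario"
    description = "Illustrative scenario following the metadata pattern in crfm-helm 0.5.8."
    tags = ["example"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # A real scenario would download data and build Instance objects here.
        return []

    def get_metadata(self) -> ScenarioMetadata:
        # Same keyword arguments as the get_metadata() methods added in this release.
        return ScenarioMetadata(
            name=self.name,
            display_name="My Custom Scenario",
            short_display_name="Custom",
            description=self.description,
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="illustrative data",
                when="n/a",
                who="n/a",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )
```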