crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
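
The most common change across the scenario modules shown below is a new get_metadata() method: each scenario imports TaxonomyInfo and ScenarioMetadata and returns a ScenarioMetadata describing its name, display name, taxonomy, main metric, and main split. A minimal consumption sketch, assuming ScenarioMetadata exposes its constructor arguments as attributes (the MMLU constructor argument is illustrative, not taken from this diff):

    from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario

    # Hypothetical usage; "anatomy" is an assumed MMLU subject name.
    scenario = MMLUScenario(subject="anatomy")
    metadata = scenario.get_metadata()  # new in this release for many scenarios
    print(metadata.display_name, metadata.main_metric, metadata.main_split)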

helm/benchmark/scenarios/mental_health_scenario.py

@@ -1,5 +1,6 @@
 import pandas as pd
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -121,3 +123,24 @@ class MentalHealthScenario(Scenario):
         instances = self.process_dialogue_data(dialogue_data)
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mental_health",
+            display_name="MentalHealth",
+            description="MentalHealth is a benchmark focused on evaluating empathetic communication in "
+            "mental health counseling. It includes real or simulated conversations between "
+            "patients and counselors, where the task is to generate compassionate and "
+            "appropriate counselor responses. The benchmark assesses a model's ability to "
+            "support patients emotionally and meaningfully engage in therapeutic "
+            "conversations.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate empathetic counseling responses in mental health " "conversations",
+                when="Any",
+                who="Counselors, Patients",
+                language="English",
+            ),
+            main_metric="mental_health_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimic_bhc_scenario.py

@@ -1,6 +1,7 @@
 import json
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,11 +11,12 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 
 class MIMICBHCScenario(Scenario):
-    """
+    r"""
     MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
     course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
 
@@ -101,3 +103,25 @@ class MIMICBHCScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimic_bhc",
+            display_name="MIMIC-IV-BHC",
+            short_display_name="MIMIC-BHC",
+            description="MIMIC-BHC is a benchmark focused on summarization of discharge notes into "
+            "Brief Hospital Course (BHC) sections. It consists of curated discharge notes "
+            "from MIMIC-IV, each paired with its corresponding BHC summary. The benchmark "
+            "evaluates a model's ability to condense detailed clinical information into "
+            "accurate, concise summaries that reflect the patient's hospital stay [(Aali et "
+            "al., 2024)](https://doi.org/10.1093/jamia/ocae312).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Summarize the clinical note into a brief hospital course",
+                when="Upon hospital discharge",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="mimic_bhc_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimic_rrs_scenario.py

@@ -1,6 +1,7 @@
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -96,3 +98,24 @@ class MIMICRRSScenario(Scenario):
             lines = file.readlines()
         lines = [line.strip() for line in lines]
         return lines
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimic_rrs",
+            display_name="MIMIC-RRS",
+            short_display_name="MIMIC-RRS",
+            description="MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III "
+            "database. It contains pairs of ‘Findings‘ and ‘Impression‘ sections, enabling "
+            "evaluation of a model's ability to summarize diagnostic imaging observations "
+            "into concise, clinically relevant conclusions [(Chen et al., "
+            "2023)](https://arxiv.org/abs/2211.08584).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate radiology report summaries from findings sections",
+                when="Post-imaging",
+                who="Radiologist",
+                language="English",
+            ),
+            main_metric="mimic_rrs_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mimiciv_billing_code_scenario.py

@@ -2,6 +2,7 @@ import pandas as pd
 import numpy as np
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -75,3 +77,23 @@ class MIMICIVBillingCodeScenario(Scenario):
                 continue
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mimiciv_billing_code",
+            display_name="MIMIC-IV Billing Code",
+            description="MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the "
+            "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The "
+            "task requires models to extract structured billing codes based on free-text "
+            "clinical notes, reflecting real-world hospital coding tasks for financial "
+            "reimbursement.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict ICD-10 billing codes from clinical discharge notes",
+                when="During or after patient discharge",
+                who="Hospital Admistrator",
+                language="English",
+            ),
+            main_metric="mimiciv_billing_code_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/mmlu_pro_scenario.py

@@ -2,6 +2,7 @@ from typing import Dict, List
 from datasets import Dataset, load_dataset
 
 from helm.common.hierarchical_logger import hlog
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -93,3 +95,19 @@ class MMLUProScenario(Scenario):
             instances.extend(self.process_dataset(data, split))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="MMLU-Pro",
+            description=self.description,
+            main_metric="chain_of_thought_correctness",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="graduate-level questions in biology, physics, and chemistry",
+                who="domain experts",
+                when="2023",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/mmlu_scenario.py

@@ -2,6 +2,7 @@ import csv
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -124,3 +126,16 @@ class MMLUScenario(Scenario):
             instances.extend(self.process_csv(csv_path, splits[split]))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="mmlu",
+            display_name="MMLU (Massive Multitask Language Understanding)",
+            short_display_name="MMLU",
+            description="The Massive Multitask Language Understanding (MMLU) benchmark for "
+            "knowledge-intensive question answering across 57 domains [(Hendrycks et al., "
+            "2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/mmmlu_scenario.py

@@ -0,0 +1,85 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MMMLUScenario(Scenario):
+    """Multilingual Massive Multitask Language Understanding (MMMLU) by OpenAI
+
+    The MMLU is a widely recognized benchmark of general knowledge attained
+    by AI models. It covers a broad range of topics from 57 different categories,
+    covering elementary-level knowledge up to advanced professional subjects like
+    law, physics, history, and computer science.
+
+    MMMLU is a translation of MMLU’s test set into 14 languages using professional
+    human translators. Relying on human translators for this evaluation increases
+    confidence in the accuracy of the translations, especially for low-resource
+    languages like Yoruba.
+
+    The Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+
+    The MMMLU dataset is from here:
+
+    - https://huggingface.co/datasets/openai/MMMLU
+    """
+
+    name = "mmmlu"
+    description = "Multilingual Massive Multitask Language Understanding"
+    tags = ["knowledge", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+
+    def __init__(self, locale: str, subject: str):
+        super().__init__()
+        self.locale: str = locale
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/MMMLU",
+            self.locale,
+            revision="325a01dc3e173cac1578df94120499aaca2e2504",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            if self.subject != "all" and row["Subject"] != self.subject:
+                continue
+            input = Input(text=row["Question"])
+            references: List[Reference] = []
+            for option in self.OPTIONS:
+                references.append(
+                    Reference(
+                        output=Output(text=row[option]),
+                        tags=[CORRECT_TAG] if option == row["Answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
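
The new MMMLUScenario above pulls the openai/MMMLU test split from Hugging Face, optionally filters rows by Subject, and turns each row into a four-option multiple-choice instance. A hedged usage sketch (the locale code "FR_FR" and the output path are illustrative assumptions, not taken from this diff):

    from helm.benchmark.scenarios.mmmlu_scenario import MMMLUScenario

    # Assumed locale config name; the dataset's actual config names are not listed in this diff.
    scenario = MMMLUScenario(locale="FR_FR", subject="all")
    instances = scenario.get_instances(output_path="./mmmlu_data")
    print(len(instances), "instances; first question:", instances[0].input.text[:80])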

helm/benchmark/scenarios/msmarco_scenario.py

@@ -4,6 +4,7 @@ import random
 from collections import defaultdict
 from typing import Dict, List, Tuple, Optional, Union
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     VALID_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
     make_rank_tag,
     make_relevance_tag,
     Input,
@@ -657,3 +659,31 @@ class MSMARCOScenario(Scenario):
         valid_instances = self.get_valid_instances()
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.track == self.REGULAR_TRACK:
+            return ScenarioMetadata(
+                name="msmarco_regular",
+                display_name="MS MARCO (regular track)",
+                short_display_name="MS MARCO (regular)",
+                description="The MS MARCO benchmark's regular track for passage retrieval in information "
+                "retrieval "
+                "[(https://microsoft.github.io/msmarco/)](https://microsoft.github.io/msmarco/).",
+                taxonomy=TaxonomyInfo(task="information retrieval", what="?", when="?", who="?", language="English"),
+                main_metric="RR@10",
+                main_split="valid",
+            )
+        elif self.track == self.TREC_TRACK:
+            return ScenarioMetadata(
+                name="msmarco_trec",
+                display_name="MS MARCO (TREC track)",
+                short_display_name="MS MARCO (TREC)",
+                description="The MS MARCO benchmark's deep learning TREC track for passage retrieval in "
+                "information retrieval "
+                "[(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).",
+                taxonomy=TaxonomyInfo(task="information retrieval", what="?", when="?", who="?", language="English"),
+                main_metric="NDCG@10",
+                main_split="valid",
+            )
+        else:
+            raise Exception(f"Unknown track {self.track}")

helm/benchmark/scenarios/mtsamples_procedures_scenario.py

@@ -1,6 +1,7 @@
 import os
 import requests
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -142,3 +144,23 @@ class MTSamplesProceduresScenario(Scenario):
                 print(f"Error processing {file_name}: {e}")
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mtsamples_procedures",
+            display_name="MTSamples Procedures",
+            description="MTSamples Procedures is a benchmark composed of transcribed operative notes, "
+            "focused on documenting surgical procedures. Each example presents a brief "
+            "patient case involving a surgical intervention, and the model is tasked with "
+            "generating a coherent and clinically accurate procedural summary or treatment "
+            "plan.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Document and extract information about medical procedures",
+                when="Post-procedure",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="mtsamples_procedures_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/mtsamples_replicate_scenario.py

@@ -1,6 +1,7 @@
 import os
 import requests
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -140,3 +142,23 @@ class MTSamplesReplicateScenario(Scenario):
                 print(f"Error processing {file_name}: {e}")
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mtsamples_replicate",
+            display_name="MTSamples",
+            short_display_name="MTSamples",
+            description="MTSamples Replicate is a benchmark that provides transcribed medical reports "
+            "from various specialties. It is used to evaluate a model's ability to generate "
+            "clinically appropriate treatment plans based on unstructured patient "
+            "documentation [(MTSamples, 2025)](https://mtsamples.com).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate treatment plans based on clinical notes",
+                when="Post-diagnosis",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="mtsamples_replicate_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/n2c2_ct_matching_scenario.py

@@ -4,6 +4,7 @@ import re
 from typing import Any, Dict, List, Tuple, Optional
 import xml.etree.ElementTree as ET
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 ORIGINAL_DEFINITIONS = {
@@ -275,3 +277,21 @@ class N2C2CTMatchingScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="n2c2_ct_matching",
+            display_name="N2C2-CT Matching",
+            short_display_name="N2C2-CT",
+            description="A dataset that provides clinical notes and asks the model to classify whether "
+            "the patient is a valid candidate for a provided clinical trial.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Classify whether a patient is a valid candidate for a clinical " "trial based on clinical notes",
+                when="Pre-Trial",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/narrativeqa_scenario.py

@@ -3,6 +3,7 @@ import random
 import csv
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Input,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -155,3 +157,21 @@ class NarrativeQAScenario(Scenario):
             instances.extend(self.get_split_instances(summaries_file=summaries_file, qaps_file=qaps_file, split=split))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="narrative_qa",
+            display_name="NarrativeQA",
+            short_display_name=None,
+            description="The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský "
+            "et al., 2017)](https://aclanthology.org/Q18-1023/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="passages are books and movie scripts, questions are unknown",
+                when="?",
+                who="?",
+                language="English",
+            ),
+            main_metric="f1_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/natural_qa_scenario.py

@@ -6,6 +6,7 @@ import os
 import re
 import html
 import random
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block, hlog
 from typing import List, Dict
 
@@ -20,6 +21,7 @@ from helm.benchmark.scenarios.scenario import (
     PassageQuestionInput,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -324,3 +326,33 @@ class NaturalQAScenario(Scenario):
             instances.extend(self.get_file_instances(target_path))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.context_mode == "closedbook":
+            name = "natural_qa_closedbook"
+            display_name = "NaturalQuestions (closed-book)"
+        elif self.context_mode == "openbook_longans":
+            name = "natural_qa_openbook_longans"
+            display_name = "NaturalQuestions (open-book)"
+        elif self.context_mode == "openbook_wiki":
+            name = "natural_qa_openbook_wiki"
+            display_name = "NaturalQuestions (open-book Wiki)"
+        else:
+            raise Exception(f"Unknown context_mode {self.context_mode}")
+        return ScenarioMetadata(
+            name=name,
+            display_name=display_name,
+            description="The NaturalQuestions [(Kwiatkowski et al., "
+            "2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering "
+            "based on naturally-occurring queries through Google Search. The input does not "
+            "include the Wikipedia page with the answer.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="passages from Wikipedia, questions from search queries",
+                when="2010s",
+                who="web users",
+                language="English",
+            ),
+            main_metric="f1_score",
+            main_split="valid",
+        )

helm/benchmark/scenarios/omni_math_scenario.py

@@ -1,6 +1,7 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     Input,
     Output,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -51,3 +53,19 @@ class OmniMATHScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="Omni-MATH",
+            description=self.description,
+            main_metric="omni_math_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="mathematics",
+                what="universal Olympiad level mathematic benchmark",
+                who="human annotators",
+                when="2024",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/open_assistant_scenario.py

@@ -2,6 +2,7 @@ from typing import List, Dict, Any, DefaultDict
 from datasets import load_dataset, Dataset
 from collections import defaultdict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     VALID_SPLIT,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -126,3 +128,23 @@ class OpenAssistantScenario(Scenario):
         valid_instances = get_split_instances(dataset["validation"], VALID_SPLIT)
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="open_assistant",
+            display_name="Open Assistant",
+            short_display_name="Open Assistant",
+            description="LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 "
+            "conversation trees ([Köpf et al., "
+            "2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial "
+            "prompt in each conversation.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Human-written dialogues and response rankings",
+                when="2023",
+                who="Open Assistant participants",
+                language="35 languages",
+            ),
+            main_metric="Helpfulness",
+            main_split="valid",
+        )