crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/wildbench_scenario.py

@@ -2,11 +2,13 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -81,3 +83,19 @@ class WildBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="WildBench",
+            description=self.description,
+            main_metric="wildbench_score_rescaled",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="GPT-judged instruction following with instructions collected from real-user conversations",
+                who="real-world users",
+                when="2024",
+                language="English",
+            ),
+        )
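
Many scenario files in this release gain the same get_metadata() hook shown above: a ScenarioMetadata carrying the display name, main metric, main split, and a TaxonomyInfo. A minimal sketch of how another scenario might adopt the pattern follows; the class and all of its field values are illustrative placeholders rather than code from the package, and only constructor fields visible in these diffs are used.

# Hedged sketch (not part of this package): the general shape of the new
# get_metadata() pattern, shown for a purely hypothetical scenario.
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


class ExampleSummarizationScenario(Scenario):
    name = "example_summarization"  # placeholder identifier
    description = "Illustrative scenario used only to demonstrate the metadata pattern."
    tags = ["summarization"]

    # A real scenario would also implement get_instances(), as in the files above.

    def get_metadata(self) -> ScenarioMetadata:
        return ScenarioMetadata(
            name=self.name,
            display_name="Example Summarization",
            description=self.description,
            main_metric="rouge_l",  # assumed metric name, for illustration only
            main_split="test",
            taxonomy=TaxonomyInfo(
                task="summarization",
                what="illustrative documents",
                who="n/a",
                when="n/a",
                language="English",
            ),
        )
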

helm/benchmark/scenarios/wmt_14_scenario.py

@@ -1,5 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -106,3 +108,13 @@ class WMT14Scenario(Scenario):
                 )
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wmt_14",
+            display_name="WMT 2014",
+            description="WMT 2014 is a collection of machine translation datasets.",
+            taxonomy=TaxonomyInfo(task="machine translation", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="bleu_4",
+            main_split="test",
+        )

helm/benchmark/static/schema_arabic.yaml

@@ -92,6 +92,12 @@ metrics:
     short_display_name: PEM
     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
     lower_is_better: false
+  - name: alrage_score
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: ALRAGE Score
+    short_display_name: Score
+    description: Score of the output judged by GPT-4o.
+    lower_is_better: false
 
 ############################################################
 perturbations: []
@@ -134,17 +140,20 @@ run_groups:
   - name: arabic_scenarios
     display_name: Arabic Scenarios
     description: Arabic Scenarios
-    category: All scenarios
+    category: Scenarios
     subgroups:
-      - mmmlu
-      - arabic_mmlu
       - alghafa
-      - exams_multilingual
+      - arabic_mmlu
+      - arabic_exams
+      - madinah_qa
       - aratrust
+      - alrage
+      - mbzuai_human_translated_arabic_mmlu
 
-  - name: mmmlu
-    display_name: Multilingual MMLU (Arabic)
-    description: Multilingual MMLU (Arabic)
+  - name: mbzuai_human_translated_arabic_mmlu
+    display_name: MBZUAI Human-Translated Arabic MMLU
+    short_display_name: Translated MMLU
+    description: A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark.
     metric_groups:
       - accuracy
       - efficiency
@@ -160,8 +169,8 @@ run_groups:
       language: Arabic
 
   - name: arabic_mmlu
-    display_name: Arabic MMLU
-    description: Arabic MMLU
+    display_name: ArabicMMLU
+    description: ArabicMMLU
     metric_groups:
       - accuracy
       - efficiency
@@ -193,9 +202,9 @@ run_groups:
       when: "before 2023"
      language: Arabic
 
-  - name: exams_multilingual
-    display_name: EXAMS (Arabic)
-    description: EXAMS (Arabic)
+  - name: arabic_exams
+    display_name: Arabic EXAMS
+    description: Arabic EXAMS
     metric_groups:
       - accuracy
       - efficiency
@@ -226,3 +235,37 @@ run_groups:
       who: "academic exams writers and takers"
       when: "before 2024"
       language: Arabic
+
+  - name: alrage
+    display_name: ALRAGE
+    description: ALRAGE
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: alrage_score
+      main_split: test
+    taxonomy:
+      task: "openbook (RAG) open-ended question answering"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Arabic
+
+  - name: madinah_qa
+    display_name: MadinahQA
+    description: Arabic language competency benchmark
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions about Arabic language"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
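
The new alrage run group's environment.main_name refers to the alrage_score metric added in the first hunk of this file, so the metrics and run_groups sections of a schema are meant to stay in sync. Below is a small, hedged sanity check, not part of the package, that cross-references the two sections; it assumes PyYAML is installed and that the schema file is readable at the path shown.

# Hedged sketch (not part of the package): check that every run group's
# environment.main_name refers to a metric declared in the same schema file.
import yaml

with open("helm/benchmark/static/schema_arabic.yaml") as f:  # assumed local path
    schema = yaml.safe_load(f)

declared_metrics = {m["name"] for m in schema.get("metrics", [])}

for group in schema.get("run_groups", []):
    main_name = (group.get("environment") or {}).get("main_name")
    if main_name and main_name not in declared_metrics:
        print(f"run group {group['name']!r} uses undeclared metric {main_name!r}")
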

helm/benchmark/static/schema_long_context.yaml

@@ -194,7 +194,7 @@ run_groups:
       - ruler_hotpotqa
       - ruler_squad
       - infinite_bench_en_sum
-      - infinite_bench_en_qa
+      # - infinite_bench_en_qa
       - infinite_bench_en_mc
       - openai_mrcr
 
@@ -233,22 +233,22 @@ run_groups:
       when: Before 2018
       language: English
 
-  - name: infinite_bench_en_qa
-    display_name: ∞Bench En.QA
-    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
-    metric_groups:
-      - accuracy
-      - general_information
-      - annotation_metrics
-    environment:
-      main_name: f1_score
-      main_split: test
-    taxonomy:
-      task: question answering
-      what: Novels
-      who: Novel authors
-      when: Before 2024
-      language: English
+  # - name: infinite_bench_en_qa
+  #   display_name: ∞Bench En.QA
+  #   description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+  #   metric_groups:
+  #     - accuracy
+  #     - general_information
+  #     - annotation_metrics
+  #   environment:
+  #     main_name: f1_score
+  #     main_split: test
+  #   taxonomy:
+  #     task: question answering
+  #     what: Novels
+  #     who: Novel authors
+  #     when: Before 2024
+  #     language: English
 
   - name: infinite_bench_en_mc
     display_name: ∞Bench En.MC

helm/benchmark/static/schema_medhelm.yaml

@@ -484,6 +484,8 @@ run_groups:
       - ehrshot
       - head_qa
       - medbullets
+      - med_qa
+      - med_mcqa
       - medalign
       - shc_ptbm_med
       - shc_sei_med
@@ -657,6 +659,40 @@
       when: Any
       language: English
 
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: med_mcqa
+    display_name: MedMCQA
+    description: MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to address real-world medical entrance exam questions ([Flores et al. 2020](https://arxiv.org/abs/2203.14371)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
   - name: medalign
     display_name: MedAlign
     short_display_name: MedAlign

helm/benchmark/static/schema_slp.yaml (new file)

@@ -0,0 +1,219 @@
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+  - name: wer_score
+    display_name: Word Error Rate
+    description: Transcription error rate.
+    lower_is_better: true
+  - name: mer_score
+    display_name: Character Error Rate
+    description: Character error rate.
+    lower_is_better: true
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: exact_match
+        split: ${main_split}
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
+  - name: transcription_accuracy
+    display_name: Transcription Accuracy
+    hide_win_rates: true
+    metrics:
+      - name: wer_score
+        split: ${main_split}
+      - name: mer_score
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+
+run_groups:
+  - name: slp
+    display_name: SLP Scenarios
+    description: SLP-language scenarios
+    category: All scenarios
+    subgroups:
+      - disorder_diagnosis
+      - transcription
+      - symptom_diagnosis
+      - disorder_type_diagnosis
+
+
+  - name: disorder_diagnosis
+    display_name: Disorder Diagnosis Accuracy
+    description: >
+      Macro-averaged accuracy on disorder diagnosis for pediatric speech disorder.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_micro_f1
+      main_split: test
+    taxonomy:
+      task: classification
+      what: n/a
+      who: n/a
+      when: "?"
+      language: English
+
+  - name: transcription
+    display_name: Transcription Accuracy
+    description: >
+      Model transcription accuracy on understanding disordered pediatric speech
+    metric_groups:
+      - transcription_accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: wer_score
+      main_split: test
+    taxonomy:
+      task: transcription
+      what: disordered pediatric speech
+      who: n/a
+      when: "?"
+      language: English
+
+  - name: symptom_diagnosis
+    display_name: Symptom Diagnosis Accuracy
+    description: >
+      Macro-averaged accuracy on symptom diagnosis for pediatric speech disorder.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_micro_f1
+      main_split: test
+    taxonomy:
+      task: classification
+      what: n/a
+      who: n/a
+      when: "?"
+      language: English
+
+  - name: disorder_type_diagnosis
+    display_name: Disorder Type Diagnosis Accuracy
+    description: >
+      Macro-averaged accuracy on disorder type diagnosis for pediatric speech disorder.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_micro_f1
+      main_split: test
+    taxonomy:
+      task: classification
+      what: n/a
+      who: n/a
+      when: "?"
+      language: English
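
The new transcription run group above uses wer_score as its main metric, with lower_is_better: true. As a rough illustration of what a word error rate metric computes, and not the package's implementation, WER is the word-level edit distance between hypothesis and reference divided by the number of reference words, so 0.0 means a perfect transcription.

# Hedged illustration (not the package's implementation): word error rate as
# word-level Levenshtein distance divided by the number of reference words.
from typing import List


def word_error_rate(reference: str, hypothesis: str) -> float:
    ref: List[str] = reference.split()
    hyp: List[str] = hypothesis.split()
    # dp[i][j] = edit distance between ref[:i] and hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,         # deletion
                dp[i][j - 1] + 1,         # insertion
                dp[i - 1][j - 1] + cost,  # substitution
            )
    return dp[len(ref)][len(hyp)] / max(len(ref), 1)


print(word_error_rate("the cat sat on the mat", "the cat sit on mat"))  # 2/6 ≈ 0.33
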