crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release. This version of crfm-helm might be problematic.

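To reproduce a file-level comparison like the one below, you can pull both wheels from PyPI and diff their member lists. A minimal sketch follows, using the public PyPI JSON API; it is not the tool that generated this page, and the wheel_namelist helper is purely illustrative.

# Sketch: compare the file lists of two published wheels via the PyPI JSON API.
# Assumes network access to pypi.org; wheel_namelist is a hypothetical helper.
import io
import json
import urllib.request
import zipfile

def wheel_namelist(package: str, version: str) -> set:
    """Download the wheel for a release and return its set of member paths."""
    meta_url = f"https://pypi.org/pypi/{package}/{version}/json"
    with urllib.request.urlopen(meta_url) as resp:
        meta = json.load(resp)
    # Each release lists its uploaded files; pick the wheel.
    wheel_url = next(f["url"] for f in meta["urls"] if f["filename"].endswith(".whl"))
    with urllib.request.urlopen(wheel_url) as resp:
        data = resp.read()
    return set(zipfile.ZipFile(io.BytesIO(data)).namelist())

old = wheel_namelist("crfm-helm", "0.5.5")
new = wheel_namelist("crfm-helm", "0.5.7")
print(f"added: {len(new - old)}, removed: {len(old - new)}")
for path in sorted(new - old)[:5]:
    print("+", path)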
Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/static/schema_slphelm.yaml (new file)
@@ -0,0 +1,162 @@
+ ---
+ ############################################################
+ metrics:
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+
+   # Classification metrics:
+   - name: classification_macro_f1
+     display_name: Macro-F1
+     description: Population-level macro-averaged F1 score.
+     lower_is_better: false
+   - name: classification_micro_f1
+     display_name: Micro-F1
+     description: Population-level micro-averaged F1 score.
+     lower_is_better: false
+
+   # Speech-specific metrics:
+   - name: wer
+     display_name: Word Error Rate
+     short_display_name: WER
+     description: Word Error Rate for automatic speech recognition evaluation.
+     lower_is_better: true
+   - name: mer
+     display_name: Match Error Rate
+     short_display_name: MER
+     description: Match Error Rate for automatic speech recognition evaluation.
+     lower_is_better: true
+   - name: wip
+     display_name: Word Information Preserved
+     short_display_name: WIP
+     description: Word Information Preserved for automatic speech recognition evaluation.
+     lower_is_better: false
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: classification_metrics
+     display_name: Classification metrics
+     metrics:
+       - name: classification_macro_f1
+         split: ${main_split}
+       - name: classification_micro_f1
+         split: ${main_split}
+       - name: exact_match
+         split: ${main_split}
+
+   - name: speech_metrics
+     display_name: Speech metrics
+     metrics:
+       - name: wer
+         split: ${main_split}
+       - name: mer
+         split: ${main_split}
+       - name: wip
+         split: ${main_split}
+
+ #######################################################
+ run_groups:
+   - name: slphelm
+     display_name: SLPHelm Scenarios
+     description: Scenarios for speech language processing evaluation
+     category: All scenarios
+     subgroups:
+       - slphelm_disorder_diagnosis
+       - slphelm_asr_disorder_diagnosis
+       - slphelm_asr_transcription
+       - slphelm_disorder_type_classification
+       - slphelm_disorder_symptom_classification
+
+   - name: slphelm_disorder_diagnosis
+     display_name: Disorder Diagnosis
+     description: Speech-based disorder diagnosis evaluation using audio input.
+     category: Disorder Diagnosis
+     metric_groups:
+       - classification_metrics
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: disorder diagnosis
+       what: "Speech disorder classification from audio"
+       who: "Children with speech disorders"
+       when: "Clinical assessment"
+       language: English
+
+   - name: slphelm_asr_disorder_diagnosis
+     display_name: ASR-Based Disorder Diagnosis
+     description: Disorder diagnosis based on automatic speech recognition transcription accuracy.
+     category: ASR-Based Disorder Diagnosis
+     metric_groups:
+       - classification_metrics
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: asr-based disorder diagnosis
+       what: "Disorder classification from transcription accuracy"
+       who: "Children with speech disorders"
+       when: "Clinical assessment"
+       language: English
+
+   - name: slphelm_asr_transcription
+     display_name: ASR-Transcription
+     description: Automatic speech recognition transcription quality evaluation.
+     category: ASR-Transcription
+     metric_groups:
+       - speech_metrics
+     environment:
+       main_name: wer
+       main_split: test
+     taxonomy:
+       task: automatic speech recognition
+       what: "Speech transcription accuracy"
+       who: "Children with speech disorders"
+       when: "Clinical assessment"
+       language: English
+
+   - name: slphelm_disorder_type_classification
+     display_name: Disorder Type Classification
+     description: Classification of specific types of speech disorders.
+     category: Disorder Type Classification
+     metric_groups:
+       - classification_metrics
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: disorder type classification
+       what: "Specific speech disorder type identification"
+       who: "Children with speech disorders"
+       when: "Clinical assessment"
+       language: English
+
+   - name: slphelm_disorder_symptom_classification
+     display_name: Disorder Symptom Classification
+     description: Classification of specific symptoms within speech disorders.
+     category: Disorder Symptom Classification
+     metric_groups:
+       - classification_metrics
+     environment:
+       main_name: classification_macro_f1
+       main_split: test
+     taxonomy:
+       task: disorder symptom classification
+       what: "Speech disorder symptom identification"
+       who: "Children with speech disorders"
+       when: "Clinical assessment"
+       language: English
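The wer, mer, and wip metrics declared above are the standard ASR error-rate definitions. As a rough illustration, the sketch below computes all three with the jiwer library; this is only an approximation under the assumption that the package's metric code follows the standard definitions, and its implementation may differ.

# Sketch: standard ASR metrics over a reference/hypothesis transcript pair.
# Uses the jiwer library; HELM's own wer/mer/wip implementation may differ.
import jiwer

reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown fox jumped over a lazy dog"

print("WER:", jiwer.wer(reference, hypothesis))  # word error rate, lower is better
print("MER:", jiwer.mer(reference, hypothesis))  # match error rate, lower is better
print("WIP:", jiwer.wip(reference, hypothesis))  # word information preserved, higher is better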
helm/benchmark/static/schema_vhelm.yaml
@@ -307,6 +307,8 @@ run_groups:
    description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
    category: Core scenarios
    subgroups:
+     - mmmu
+     - exams_v
      - gqa
      - math_vista
      - seed_bench
@@ -320,7 +322,6 @@ run_groups:
    category: Core scenarios
    subgroups:
      - a_okvqa_base
-     - mmmu
      - mme
      - vibe_eval
      - mm_star_knowledge
@@ -369,7 +370,6 @@ run_groups:
      - a_okvqa_hindi
      - a_okvqa_spanish
      - a_okvqa_swahili
-     - exams_v
      - bingo_multilinguality
  - name: a_okvqa_base
    display_name: A-OKVQA
@@ -378,7 +378,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -394,7 +394,7 @@ run_groups:
      - fairness
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -410,7 +410,7 @@ run_groups:
      - translate
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -426,7 +426,7 @@ run_groups:
      - translate
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -442,7 +442,7 @@ run_groups:
      - translate
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -458,7 +458,7 @@ run_groups:
      - translate
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -474,7 +474,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -490,7 +490,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -506,7 +506,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -522,7 +522,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -538,7 +538,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -554,7 +554,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -602,7 +602,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: quasi_exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: short-answer question answering
@@ -618,7 +618,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: test
    taxonomy:
      task: toxicity identification
@@ -651,7 +651,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: quasi_exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: short-answer question answering
@@ -667,7 +667,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: quasi_exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: short-answer question answering
@@ -683,7 +683,7 @@ run_groups:
      - fairness
      - general_information
    environment:
-     main_name: quasi_exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: short-answer question answering
@@ -715,7 +715,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -795,7 +795,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: test
    taxonomy:
      task: short-answer question answering
@@ -811,7 +811,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
@@ -827,7 +827,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
@@ -875,7 +875,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
@@ -891,7 +891,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: valid
    taxonomy:
      task: multiple-choice question answering
@@ -923,7 +923,7 @@ run_groups:
      - accuracy
      - general_information
    environment:
-     main_name: exact_match
+     main_name: quasi_prefix_exact_match
      main_split: test
    taxonomy:
      task: multiple-choice question answering
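Every hunk above switches the run group's main accuracy metric from exact_match (or quasi_exact_match) to quasi_prefix_exact_match, which gives credit when the lightly normalized model output begins with a correct reference, so a correct choice followed by an explanation still scores. A simplified sketch of that matching rule is below; the real normalization lives in helm/benchmark/metrics/evaluate_reference_metrics.py and handles more cases.

# Simplified sketch of prefix quasi-exact matching; illustrative only.
import re
import string

def normalize(text: str) -> str:
    """Lowercase, drop punctuation and articles, collapse whitespace."""
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def quasi_prefix_exact_match(prediction: str, reference: str) -> float:
    return float(normalize(prediction).startswith(normalize(reference)))

print(quasi_prefix_exact_match("B. The cat is on the mat.", "B"))  # 1.0
print(quasi_prefix_exact_match("C. A dog.", "B"))                  # 0.0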
helm/benchmark/static/schema_video.yaml (new file)
@@ -0,0 +1,219 @@
+ ---
+ ############################################################
+ perturbations: []
+ ############################################################
+ metrics:
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end of text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Vision Language metrics [image]:
+   - name: earth_mover_similarity
+     display_name: Earth Mover Similarity
+     short_display_name: EMD-Sim
+     description: 1 - Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: pixel_similarity
+     display_name: Pixel Similarity
+     short_display_name: PS
+     description: Pixel Similarity between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: sift_similarity
+     display_name: SIFT Similarity
+     short_display_name: SIFT
+     description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: compilation_success
+     display_name: Compilation success
+     description: Fraction of instances where the generated code compiles successfully.
+     lower_is_better: false
+   - name: lpips_similarity
+     display_name: LPIPS similarity
+     short_display_name: LPIPS
+     description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: fid_similarity
+     display_name: FID similarity
+     short_display_name: FID
+     description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
+     lower_is_better: false
+   - name: ssim_similarity
+     display_name: SSIM
+     short_display_name: SSIM
+     description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
+     lower_is_better: false
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+   - name: quasi_leave_articles_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+   - name: prefix_exact_match
+     display_name: Prefix exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_prefix_exact_match
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: Prefix quasi-exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+     lower_is_better: false
+
+   - name: rouge_1
+     display_name: ROUGE-1
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: rouge_2
+     display_name: ROUGE-2
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+     lower_is_better: false
+   - name: rouge_l
+     display_name: ROUGE-L
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+     lower_is_better: false
+   - name: bleu_1
+     display_name: BLEU-1
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: bleu_4
+     display_name: BLEU-4
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+     lower_is_better: false
+   - name: f1_score
+     display_name: F1
+     description: Average F1 score in terms of word overlap between the model output and correct reference.
+     lower_is_better: false
+   - name: cider
+     display_name: CIDEr
+     description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
+     lower_is_better: false
+   - name: prometheus_vision
+     display_name: Prometheus Vision rating
+     description: Scores are from 1 to 5, where 5 is the best.
+     lower_is_better: false
+
+   # Toxicity metrics
+   - name: expected_max_toxicity
+     display_name: Average max toxicity
+     short_display_name: Avg. max toxicity
+     lower_is_better: true
+     description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: max_toxicity_probability
+     display_name: max toxicity
+     lower_is_better: true
+     description: Fraction of instances where the most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+   - name: toxic_frac
+     display_name: Toxic fraction
+     lower_is_better: true
+     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+
+ ############################################################
+ run_groups:
+   - name: core_scenarios
+     display_name: All
+     description: All scenarios across capabilities
+     category: All scenarios
+     subgroups:
+       - msr_vtt
+
+   - name: msr_vtt
+     display_name: MSR-VTT
+     description: A large-scale video benchmark for video understanding, especially the emerging task of translating video to text.
+     metric_groups:
+       - accuracy
+       - general_information
+     environment:
+       main_name: f1_score
+       main_split: test
+     taxonomy:
+       task: captioning
+       what: Real-world videos
+       who: Human experts
+       when: "2016"
+       language: English
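In both new schemas, metric groups reference ${main_name} and ${main_split} placeholders that each run group fills in from its environment block (msr_vtt, for example, sets main_name: f1_score and main_split: test). The sketch below mimics that substitution with Python's string.Template; the actual resolution happens inside HELM's schema and summarization code, so this is illustrative only.

# Sketch: resolving ${main_name}/${main_split} placeholders in a metric
# group using a run group's environment. Illustrative only.
from string import Template

accuracy_group = [{"name": "${main_name}", "split": "${main_split}"}]
msr_vtt_environment = {"main_name": "f1_score", "main_split": "test"}

resolved = [
    {key: Template(value).substitute(msr_vtt_environment) for key, value in entry.items()}
    for entry in accuracy_group
]
print(resolved)  # [{'name': 'f1_score', 'split': 'test'}]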