crfm-helm 0.5.5 → 0.5.7 (py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

This version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
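The file-level listing above can be reproduced locally by comparing the RECORD manifests of the two wheels. Below is a minimal sketch using only the Python standard library; the wheel filenames are assumptions based on the versions being compared, not paths taken from this page.

import zipfile

def wheel_files(path: str) -> dict[str, str]:
    """Map each file listed in the wheel's RECORD manifest to its recorded hash."""
    with zipfile.ZipFile(path) as wheel:
        record_name = next(n for n in wheel.namelist() if n.endswith(".dist-info/RECORD"))
        lines = wheel.read(record_name).decode("utf-8").splitlines()
    entries = {}
    for line in lines:
        if not line.strip():
            continue
        # RECORD rows are "path,hash,size"; split from the right so paths survive.
        name, digest, _size = line.rsplit(",", 2)
        entries[name] = digest
    return entries

# Assumed local filenames for the two releases being compared.
old = wheel_files("crfm_helm-0.5.5-py3-none-any.whl")
new = wheel_files("crfm_helm-0.5.7-py3-none-any.whl")

added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
changed = sorted(n for n in set(old) & set(new) if old[n] != new[n])
print(f"added: {len(added)}, removed: {len(removed)}, changed: {len(changed)}")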
helm/benchmark/static/schema_audio.yaml

@@ -194,6 +194,11 @@ metrics:
     lower_is_better: false
     description: Judgements by GPT-4o
 
+  - name: refusal_rate
+    display_name: Refusal rate for safety
+    lower_is_better: false
+    description: Judgements by GPT-4o
+
 ############################################################
 perturbations: []
 
@@ -235,6 +240,7 @@ run_groups:
     subgroups:
       - auditory_perception
       - emotion_detection
+      - reasoning
       - knowledge
       - robustness
       - multilinguality
@@ -261,12 +267,20 @@ run_groups:
       - meld_audio
       - mustard
 
+  - name: reasoning
+    display_name: Reasoning
+    description: Reasoning about the audio
+    category: Core scenarios
+    subgroups:
+      - air_bench_chat_reasoning
+      - corebench
+
   - name: knowledge
     display_name: Knowledge
     description: Recalling facts or information contained in the audio LLM.
     category: Core scenarios
     subgroups:
-      - air_bench_chat
+      - air_bench_chat_knowledge
       - air_bench_foundation
 
   - name: robustness
@@ -349,7 +363,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: quasi_exact_match
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: audio classification
@@ -458,7 +472,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio identification
@@ -491,28 +505,6 @@ run_groups:
       when: "2024"
       language: English, Spanish
 
-  - name: audio_pairs
-    display_name: Audio PAIRS
-    description: >
-      Audio PAIRS is an audio extension of the PAIRS dataset (Fraser et al, 2024) to examine gender and
-      racial bias in audio large language models. We convert the questions in the PAIRS dataset to audio
-      clips using OpenAI's TTS-1-HD API. This dataset is also modified to add an option to opt-out with
-      "unclear" as a choice.
-
-      The dataset contains the audio and question for three subsets: occupation, status, and potential_crime.
-    metric_groups:
-      - accuracy
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: audio classification
-      what: audio and question of audio samples to examine models' gender and racial bias
-      who: OpenAI's TTS-1-HD
-      when: "2024"
-      language: English
-
   - name: meld_audio
     display_name: Multimodal EmotionLines Dataset (MELD) Audio
     description: >
@@ -524,7 +516,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: quasi_exact_match
+      main_name: quasi_prefix_exact_match
       main_split: test
     taxonomy:
       task: audio classification
@@ -533,33 +525,30 @@ run_groups:
       when: "2018"
       language: English
 
-  - name: casual_conversations2
-    display_name: Casual Conversations 2
+  - name: air_bench_chat_knowledge
+    display_name: Air-Bench Chat (knowledge subsets)
     description: >
-      Casual Conversation v2 (Porgali et al, 2023) is composed of over 5,567 participants (26,467 videos).
-      The videos feature paid individuals who agreed to participate in the project and explicitly provided
-      Age, Gender, Language/Dialect, Geo-location, Disability, Physical adornments, Physical attributes labels
-      themselves. The videos were recorded in Brazil, India, Indonesia, Mexico, Philippines, United States,
-      and Vietnam with a diverse set of adults in various categories.
-
-      The dataset contains two classification tasks: age and gender classification
-      ([Porgali et al., 2023](https://arxiv.org/abs/2303.04838)). We phrase these two tasks as the multi-choice
-      questions answering task.
+      Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
+      approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data.
+      We consider the chat benchmark in this scenario.
+
+      The dataset contains the audio question answering task in four subjects: sound, speech, music, and mixed.
+      ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf)).
     metric_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: gpt4_audio_critique
       main_split: test
     taxonomy:
-      task: audio classification
-      what: audio, spoken language, speaker's gender, age information of audio samples
+      task: audio question answering
+      what: audio, question, and answer of audio samples
       who: real speakers
-      when: "2023"
-      language: 10 languages
+      when: "2024"
+      language: English
 
-  - name: air_bench_chat
-    display_name: Air-Bench Chat
+  - name: air_bench_chat_reasoning
+    display_name: Air-Bench Chat (reasoning subsets)
     description: >
       Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with
       approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data.
@@ -594,7 +583,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio question answering
@@ -612,7 +601,7 @@ run_groups:
       - accuracy
      - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: toxicity detection
@@ -631,7 +620,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: sarcasm detection
@@ -648,7 +637,7 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: toxic_frac
+      main_name: refusal_rate
       main_split: test
     taxonomy:
       task: refusal for safety
@@ -742,11 +731,33 @@ run_groups:
       - accuracy
       - general_information
     environment:
-      main_name: exact_match
+      main_name: quasi_exact_match
       main_split: test
     taxonomy:
       task: audio classification
       what: audio, question and answer given the audio
       who: OpenAI's TTS
       when: "2025"
+      language: English
+
+  - name: corebench
+    display_name: COREBench
+    description: >
+      The COREBench is a new audio benchmark incorporating multi-speaker conversations.
+      It consists of conversational audio, transcript, question, and answer. There
+      are two challenging features of this benchmark: (1) the questions are designed
+      to require reasoning over multiple turns of conversation, and (2) the average
+      audio length is longer than 1 minute, which is significantly longer than
+      existing benchmarks.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_prefix_exact_match
+      main_split: test
+    taxonomy:
+      task: audio question-answering
+      what: audio, question, transcripts and answer given the audio
+      who: OpenAI's TTS
+      when: "2025"
       language: English
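Several of the hunks above switch a run group's main metric from exact_match to quasi_exact_match or quasi_prefix_exact_match. As a rough, hedged illustration of the distinction only (not HELM's actual implementation in helm/benchmark/metrics/, which applies more normalization than this sketch):

def _normalize(text: str) -> str:
    # Collapse case and whitespace; the real normalizer does more than this.
    return " ".join(text.lower().split())

def quasi_exact_match(prediction: str, reference: str) -> bool:
    # Match after normalization instead of requiring byte-for-byte equality.
    return _normalize(prediction) == _normalize(reference)

def quasi_prefix_exact_match(prediction: str, reference: str) -> bool:
    # Also credit predictions that merely start with the reference,
    # e.g. prediction "B. The speaker sounds angry" against reference "B".
    return _normalize(prediction).startswith(_normalize(reference))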
helm/benchmark/static/schema_classic.yaml

@@ -1683,23 +1683,6 @@ run_groups:
       when: n/a
       language: synthetic
 
-  - name: numeracy
-    display_name: Numerical reasoning
-    description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: absolute_value_difference
-      main_split: test
-    taxonomy:
-      task: next-word prediction
-      what: Dyck formal language
-      who: n/a
-      when: n/a
-      language: synthetic
-
   - name: synthetic_reasoning
     display_name: Synthetic reasoning (abstract symbols)
     description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
helm/benchmark/static/schema_enterprise.yaml

@@ -76,6 +76,10 @@ metrics:
     display_name: Float Equivalence
     description: Float Equivalence
     lower_is_better: false
+  - name: adjusted_macro_f1_score
+    display_name: Adjusted Macro F1 Score
+    short_display_name: Adjusted Macro F1 Score
+    description: Entity type classification F1 score, adjusted for partial matches following the KPI-Edgar paper, macro-averaged across entity types
 
 ############################################################
 perturbations: []
@@ -119,6 +123,7 @@ run_groups:
       - gold_commodity_news
       - financial_phrasebank
       - conv_fin_qa_calc
+      - kpi_edgar
 
   - name: legal_scenarios
     display_name: Legal Scenarios
@@ -195,6 +200,22 @@ run_groups:
       when: 2000-2019
       language: English
 
+  - name: kpi_edgar
+    display_name: KPI-EDGAR Financial Documents (Named Entity Recognition)
+    description: A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: adjusted_macro_f1_score
+      main_split: test
+    taxonomy:
+      task: named entity recognition
+      what: financial reports
+      who: financial experts
+      when: before 2022
+      language: English
+
   - name: legal_contract_summarization
     display_name: Legal Contract Summarization
     description: Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf).
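The new adjusted_macro_f1_score above is described as an entity-type classification F1, adjusted for partial matches following the KPI-EDGAR paper and macro-averaged across entity types. The sketch below illustrates only the macro-averaging over entity types; the paper's partial-match adjustment is not reproduced, and this is an illustrative function, not HELM's kpi_edgar_metrics implementation.

from collections import defaultdict

def macro_f1(gold: list[tuple[str, str]], pred: list[tuple[str, str]]) -> float:
    """Macro-averaged F1 over (entity_text, entity_type) pairs."""
    gold_set, pred_set = set(gold), set(pred)
    types = {etype for _, etype in gold_set | pred_set}
    tp, fp, fn = defaultdict(int), defaultdict(int), defaultdict(int)
    for text, etype in pred_set:
        # A predicted entity counts as a true positive only on an exact match.
        (tp if (text, etype) in gold_set else fp)[etype] += 1
    for _text, etype in gold_set - pred_set:
        fn[etype] += 1
    scores = []
    for etype in types:
        precision = tp[etype] / (tp[etype] + fp[etype]) if tp[etype] + fp[etype] else 0.0
        recall = tp[etype] / (tp[etype] + fn[etype]) if tp[etype] + fn[etype] else 0.0
        scores.append(2 * precision * recall / (precision + recall) if precision + recall else 0.0)
    return sum(scores) / len(scores) if scores else 0.0

# One entity predicted with the right type, one with the wrong type:
print(macro_f1([("revenue", "kpi"), ("2021", "year")],
               [("revenue", "kpi"), ("2021", "kpi")]))  # ~0.33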
helm/benchmark/static/schema_long_context.yaml

@@ -94,6 +94,14 @@ metrics:
     display_name: ROUGE-L
     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
     lower_is_better: false
+  - name: ruler_string_match_part
+    display_name: RULER String Match
+    description: RULER String Match
+    lower_is_better: false
+  - name: openai_mrcr_accuracy
+    display_name: MRCR Accuracy
+    description: MRCR Accuracy
+    lower_is_better: false
 
   # Toxicity metrics
   - name: expected_max_toxicity
@@ -180,61 +188,114 @@ metric_groups:
 run_groups:
   - name: long_context_scenarios
     display_name: Long Context Scenarios
-    description: Scenarios for the model safety
+    description: Scenarios for evaluating long context capabilities
     category: All scenarios
     subgroups:
       - ruler_hotpotqa
       - ruler_squad
-      - infinite_bench_sum
+      - infinite_bench_en_sum
+      - infinite_bench_en_qa
+      - infinite_bench_en_mc
+      - openai_mrcr
 
   - name: ruler_hotpotqa
     display_name: RULER HotPotQA
-    description: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: f1_score
+      main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
+      task: question answering with retrieval-augmented generation
+      what: Wikipedia articles
+      who: Wikipedia authors
+      when: Before 2018
       language: English
 
 
   - name: ruler_squad
     display_name: RULER SQuAD
-    description: RULER SQuAD
+    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: f1_score
+      main_name: ruler_string_match_part
      main_split: valid
     taxonomy:
       task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
+      what: Wikipedia articles
+      who: Wikipedia authors and crowdworkers
+      when: Before 2018
       language: English
 
-  - name: infinite_bench_sum
-    display_name: ∞Bench Sum
-    description: ∞Bench Sum
+  - name: infinite_bench_en_qa
+    display_name: ∞Bench En.QA
+    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: rouge_l
+      main_name: f1_score
       main_split: test
     taxonomy:
       task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: infinite_bench_en_mc
+    display_name: ∞Bench En.MC
+    description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: infinite_bench_en_sum
+    display_name: ∞Bench En.Sum
+    description: ∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: rouge_l
+      main_split: test
+    taxonomy:
+      task: multi-hop question answering
+      what: Novels
+      who: Novel authors
+      when: Before 2024
+      language: English
+
+  - name: openai_mrcr
+    display_name: OpenAI MRCR
+    description: OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: openai_mrcr_accuracy
+      main_split: test
+    taxonomy:
+      task: MRCR
+      what: Synthetic data
+      who: "None"
+      when: "2025"
       language: English
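Schema YAML files like the ones shown in these hunks determine how results are grouped and which headline metric each run group reports. A minimal sketch of inspecting one of them, assuming PyYAML is installed and the file is read from a source checkout; the printed pairing of group name and main metric mirrors the environment.main_name fields added above.

import yaml  # PyYAML

with open("helm/benchmark/static/schema_long_context.yaml") as f:
    schema = yaml.safe_load(f)

# Print each run group's headline metric, e.g. "openai_mrcr -> openai_mrcr_accuracy".
for group in schema.get("run_groups", []):
    main_metric = group.get("environment", {}).get("main_name")
    print(f"{group['name']} -> {main_metric}")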