crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
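
The largest single addition in this release is the new audio evaluation schema; the hunk below shows the beginning of the new file helm/benchmark/static/schema_audio.yaml (entry 389 in the list above, +752 lines).
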
@@ -0,0 +1,752 @@
+ ---
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end-of-text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_exact_match
+     display_name: Quasi-exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+     lower_is_better: false
+   - name: prefix_exact_match
+     display_name: Prefix exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+     lower_is_better: false
+   - name: quasi_prefix_exact_match
+     # TODO: should call this prefix_quasi_exact_match
+     display_name: Prefix quasi-exact match
+     short_display_name: PEM
+     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+     lower_is_better: false
+
+   - name: rouge_1
+     display_name: ROUGE-1
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: rouge_2
+     display_name: ROUGE-2
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+     lower_is_better: false
+   - name: rouge_l
+     display_name: ROUGE-L
+     description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+     lower_is_better: false
+   - name: bleu_1
+     display_name: BLEU-1
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+     lower_is_better: false
+   - name: bleu_4
+     display_name: BLEU-4
+     description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+     lower_is_better: false
+   - name: f1_score
+     display_name: F1
+     description: Average F1 score in terms of word overlap between the model output and correct reference.
+     lower_is_better: false
+   - name: cider
+     display_name: CIDEr
+     description: Evaluates the quality of generated captions by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
+     lower_is_better: false
+
+   # Machine Translation metrics
+   - name: bleu
+     display_name: BLEU
+     short_display_name: BLEU
+     description: BLEU score based on [Post (2018)](https://aclanthology.org/W18-6319/).
+     lower_is_better: false
+
+   # Speech Recognition metrics
+   - name: wer_score
+     display_name: Word Error Rate
+     short_display_name: WER
+     description: Word error rate between model predictions and ground truth answers for ASR tasks.
+     lower_is_better: true
+
+   - name: mer_score
+     display_name: Match Error Rate
+     short_display_name: MER
+     description: Word match error rate between model predictions and ground truth answers.
+     lower_is_better: true
+
+   - name: wip_score
+     display_name: Word Information Preservation
+     short_display_name: WIP
+     description: Word information preservation (WIP) for evaluating the preserved information of ASR.
+     lower_is_better: false
+
+   - name: cer_score
+     display_name: Character Error Rate
+     short_display_name: CER
+     description: Character error rate (CER) for evaluating the accuracy of ASR.
+     lower_is_better: true
+
+   - name: chinese_wer_score
+     display_name: Chinese Word Error Rate
+     short_display_name: Chinese WER
+     description: Chinese word error rate between model predictions and ground truth answers for ASR tasks.
+     lower_is_better: true
+
+   - name: chinese_mer_score
+     display_name: Chinese Match Error Rate
+     short_display_name: Chinese MER
+     description: Chinese word match error rate between model predictions and ground truth answers.
+     lower_is_better: true
+
+   - name: chinese_wip_score
+     display_name: Chinese Word Information Preservation
+     short_display_name: Chinese WIP
+     description: Chinese word information preservation (WIP) for evaluating the preserved information of ASR.
+     lower_is_better: false
+
+   - name: chinese_cer_score
+     display_name: Chinese Character Error Rate
+     short_display_name: Chinese CER
+     description: Chinese character error rate (CER) for evaluating the accuracy of Chinese ASR.
+     lower_is_better: true
+
+   # Toxicity metrics
+   - name: expected_max_toxicity
+     display_name: Average max toxicity
+     short_display_name: Avg. max toxicity
+     lower_is_better: true
+     description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+
+   - name: max_toxicity_probability
+     display_name: max toxicity
+     lower_is_better: true
+     description: Fraction of instances where the most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+
+   - name: toxic_frac
+     display_name: Toxic fraction
+     lower_is_better: true
+     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+   - name: gpt4_audio_critique
+     display_name: GPT-4o Judge Critique
+     lower_is_better: false
+     description: Judgements of model responses by a GPT-4o judge.
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
228
+ ############################################################
229
+
230
+ run_groups:
231
+ - name: audio_scenarios
232
+ display_name: Audio Scenarios
233
+ description: Audio Scenarios
234
+ category: All scenarios
235
+ subgroups:
236
+ - auditory_perception
237
+ - emotion_detection
238
+ - knowledge
239
+ - robustness
240
+ - multilinguality
241
+ - toxicity_detection
242
+ - safety
243
+ - fairness
244
+ - bias
245
+
246
+ - name: auditory_perception
247
+ display_name: Auditory Perception
248
+ description: Interpreting fundamental information in an audio clip
249
+ category: Core scenarios
250
+ subgroups:
251
+ - audiocaps
252
+ - voxceleb2
253
+ - vocal_sound
254
+ - librispeech
255
+
256
+ - name: emotion_detection
257
+ display_name: Emotion Detection
258
+ description: Detecting emotions in audio clips.
259
+ category: Core scenarios
260
+ subgroups:
261
+ - meld_audio
262
+ - mustard
263
+
264
+ - name: knowledge
265
+ display_name: Knowledge
266
+ description: Recalling facts or information contained in the audio LLM.
267
+ category: Core scenarios
268
+ subgroups:
269
+ - air_bench_chat
270
+ - air_bench_foundation
271
+
272
+ - name: robustness
273
+ display_name: Robustness
274
+ description: Producing desired answers under invariant perturbations to the audio.
275
+ category: Core scenarios
276
+ subgroups:
277
+ - speech_robust_bench
278
+
279
+ - name: multilinguality
280
+ display_name: Multilinguality
281
+ description: Performs the same task when the language is changed
282
+ category: Core scenarios
283
+ subgroups:
284
+ - covost2
285
+ - fleurs
286
+ - multilingual_librispeech
287
+
288
+ - name: toxicity_detection
289
+ display_name: Toxicity detection
290
+ description: Identifying and avoiding offensive or damaging materials.
291
+ category: Core scenarios
292
+ subgroups:
293
+ - mutox
294
+
295
+ - name: safety
296
+ display_name: Safety
297
+ description: Refusing to produce answers that cause harm to humans.
298
+ category: Core scenarios
299
+ subgroups:
300
+ - voice_jailbreak_attacks
301
+
302
+ - name: fairness
303
+ display_name: Fairness
304
+ description: Whether the model is fair to all groups or demographics.
305
+ category: Core scenarios
306
+ subgroups:
307
+ - fleurs_fairness
308
+ - librispeech_fairness
309
+
310
+ - name: bias
311
+ display_name: Bias
312
+ description: Whether the model is biased towards certain groups or demographics.
313
+ category: Core scenarios
314
+ subgroups:
315
+ - parade
316
+
317
+ - name: covost2
318
+ display_name: CoVost-2
319
+ description: >
320
+ CoVost-2 is a large-scale multilingual speech translation corpus covering translations from 21 languages
321
+ into English and from English into 15 languages.
322
+
323
+ The dataset contains the audio, transcriptions, and translations in the following languages:
324
+ French, German, Dutch, Russian, Spanish, Italian, Turkish, Persian, Swedish, Mongolian, Chinese,
325
+ Welsh, Catalan, Slovenian, Estonian, Indonesian, Arabic, Tamil, Portuguese, Latvian, and Japanese
326
+ ([Wang et al, 2020](https://arxiv.org/abs/2007.10310)).
327
+ metric_groups:
328
+ - accuracy
329
+ - general_information
330
+ environment:
331
+ main_name: bleu
332
+ main_split: test
333
+ taxonomy:
334
+ task: audio machine translation
335
+ what: audio, transcriptions, and translations in 15 languages
336
+ who: real speakers
337
+ when: "2020"
338
+ language: 15 languages
339
+
340
+ - name: vocal_sound
341
+ display_name: VocalSound
342
+ description: >
343
+ VocalSound dataset consisting of over 21,000 crowdsourced recordings of laughter, sighs, coughs, throat
344
+ clearing, sneezes, and sniffs from 3,365 unique subjects.
345
+
346
+ Different from previous datasets, the VocalSound dataset contains meta information such as speaker
347
+ age, gender, native language, country, and health condition ([Gong et al, 2022](https://arxiv.org/abs/2205.03433)).
348
+ metric_groups:
349
+ - accuracy
350
+ - general_information
351
+ environment:
352
+ main_name: quasi_exact_match
353
+ main_split: test
354
+ taxonomy:
355
+ task: audio classification
356
+ what: audio samples of behaviours ("cough", "laughter", "sigh", "sneeze", "sniff", "throatclearing")
357
+ who: real speakers
358
+ when: "2022"
359
+ language: English
360
+
361
+ - name: multilingual_librispeech
362
+ display_name: Multilingual Librispeech
363
+ description: >
364
+ Multilingual Librispeech is derived from read audiobooks from LibriVox and consists of 8 languages,
365
+ including about 44.5K hours of English and a total of about 6K hours for other languages.
366
+
367
+ The dataset contains the audio and transcriptions in the following languages:
368
+ Dutch, German, French, Spanish, Italian, Portuguese", Polish ([Pratap et al, 2022](https://arxiv.org/abs/2012.03411)).
369
+ metric_groups:
370
+ - accuracy
371
+ - general_information
372
+ environment:
373
+ main_name: wer_score
374
+ main_split: test
375
+ taxonomy:
376
+ task: audio captioning
377
+ what: audio and transcriptions in 8 languages
378
+ who: real speakers
379
+ when: "2020"
380
+ language: 8 languages
381
+
382
+ - name: fleurs
383
+ display_name: FLEURS
384
+ description: >
385
+ FLEURS is an n-way parallel speech dataset in 102 languages built on top of the machine translation FLoRes-101
386
+ benchmark, with approximately 12 hours of speech supervision per language. FLEURS can be used for a variety of
387
+ speech tasks, including Automatic Speech Recognition (ASR), Speech Language Identification (Speech LangID),
388
+ Translation and Retrieval.
389
+
390
+ The dataset contains the audio, transcriptions, and language in 102 different languages, which are divided into
391
+ 7 language groups: Western European, Eastern European, Central Asia Middle North African, Sub Saharan African,
392
+ South Asian, South East Asian, Chinese Japanase Korean ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446)).
393
+ metric_groups:
394
+ - accuracy
395
+ - general_information
396
+ environment:
397
+ main_name: wer_score
398
+ main_split: test
399
+ taxonomy:
400
+ task: audio classification
401
+ what: audio, transcripts, and language names in 102 languages
402
+ who: real speakers
403
+ when: "2022"
404
+ language: 102 languages
405
+
406
+ - name: fleurs_fairness
407
+ display_name: FLEURS Fairness
408
+ description: >
409
+ FLEURS is an n-way parallel speech dataset in 102 languages built on top of the machine translation FLoRes-101
410
+ benchmark, with approximately 12 hours of speech supervision per language. FLEURS can be used for a variety of
411
+ speech tasks, including Automatic Speech Recognition (ASR), Speech Language Identification (Speech LangID),
412
+ Translation and Retrieval.
413
+
414
+ We only use the English subset of the dataset for the fairness task. We ask the model to do ASR on
415
+ audio files from different gender groups ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446)).
416
+ metric_groups:
417
+ - accuracy
418
+ - general_information
419
+ environment:
420
+ main_name: wer_score
421
+ main_split: test
422
+ taxonomy:
423
+ task: audio classification
424
+ what: audio, transcripts, and gender of the speaker
425
+ who: real speakers
426
+ when: "2022"
427
+ language: English
428
+
429
+ - name: audiocaps
430
+ display_name: AudioCaps
431
+ description: >
432
+ AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
433
+ via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
434
+ musical instruments and genres, and common everyday environmental sounds.
435
+ ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf)).
436
+ metric_groups:
437
+ - accuracy
438
+ - general_information
439
+ environment:
440
+ main_name: gpt4_audio_critique
441
+ main_split: test
442
+ taxonomy:
443
+ task: audio captioning
444
+ what: audio clips in the wild
445
+ who: real speakers
446
+ when: "2019"
447
+ language: English
448
+
449
+ - name: voxceleb2
450
+ display_name: VoxCeleb2
451
+ description: >
452
+ VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
453
+ interview videos uploaded to YouTube. It contains over a million utterances from over 6,000
454
+ speakers with their gender, race, identity information in 145 different nationalities, covering
455
+ a wide range of accents, ages, ethnicities and languages.
456
+ ([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
457
+ metric_groups:
458
+ - accuracy
459
+ - general_information
460
+ environment:
461
+ main_name: exact_match
462
+ main_split: test
463
+ taxonomy:
464
+ task: audio identification
465
+ what: audio clips in the wild
466
+ who: real speakers
467
+ when: "2018"
468
+ language: English, Germany, French
469
+
470
+ - name: speech_robust_bench
+   display_name: Speech Robust Bench
+   description: >
+     Speech Robust Bench (SRB) is a comprehensive benchmark for evaluating the robustness of ASR models
+     to diverse corruptions. SRB is composed of 114 input perturbations that simulate a heterogeneous
+     range of corruptions that ASR models may encounter when deployed in the wild. In this scenario, we
+     select 4 subsets for evaluation: accent_cv, accent_cv_es, chinme, and AIM.
+
+     The dataset contains the audio and transcriptions for all subsets
+     ([Shah et al., 2024](https://arxiv.org/abs/2403.07937)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: wer_score
+     main_split: test
+   taxonomy:
+     task: audio recognition
+     what: audio and transcripts of audio samples under a wide range of perturbations
+     who: real speakers
+     when: "2024"
+     language: English, Spanish
+
+ - name: audio_pairs
+   display_name: Audio PAIRS
+   description: >
+     Audio PAIRS is an audio extension of the PAIRS dataset (Fraser et al., 2024) for examining gender
+     and racial bias in audio large language models. We convert the questions in the PAIRS dataset to
+     audio clips using OpenAI's TTS-1-HD API, and modify the dataset to add an opt-out option
+     ("unclear") as a choice.
+
+     The dataset contains the audio and question for three subsets: occupation, status, and potential_crime.
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: exact_match
+     main_split: test
+   taxonomy:
+     task: audio classification
+     what: audio and questions probing models' gender and racial bias
+     who: OpenAI's TTS-1-HD
+     when: "2024"
+     language: English
+
+ - name: meld_audio
+   display_name: Multimodal EmotionLines Dataset (MELD) Audio
+   description: >
+     The Multimodal EmotionLines Dataset (MELD) was created by enhancing and extending the EmotionLines
+     dataset. MELD contains more than 1,400 dialogues and 13,000 utterances from the Friends TV series,
+     with multiple speakers participating in the dialogues. Each utterance in a dialogue is labeled with
+     one of seven emotions: Anger, Disgust, Sadness, Joy, Neutral, Surprise, or Fear.
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: quasi_exact_match
+     main_split: test
+   taxonomy:
+     task: audio classification
+     what: classify audio by emotion
+     who: Friends TV series
+     when: "2018"
+     language: English
+
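`quasi_exact_match` (used by MELD above) is a relaxed variant of `exact_match`: both strings are normalized before comparison, so e.g. "joy." matches "Joy". A minimal sketch of that idea follows; HELM's exact normalization rules may differ:

```python
import re
import string

def _normalize(text: str) -> str:
    """Lowercase, strip punctuation and articles, collapse whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def exact_match(gold: str, pred: str) -> bool:
    return gold == pred

def quasi_exact_match(gold: str, pred: str) -> bool:
    return _normalize(gold) == _normalize(pred)

print(exact_match("Joy", "joy."))        # False
print(quasi_exact_match("Joy", "joy."))  # True
```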
+ - name: casual_conversations2
+   display_name: Casual Conversations 2
+   description: >
+     Casual Conversations v2 (Porgali et al., 2023) comprises 5,567 participants (26,467 videos).
+     The videos feature paid individuals who agreed to participate in the project and who themselves
+     explicitly provided labels for age, gender, language/dialect, geo-location, disability, physical
+     adornments, and physical attributes. The videos were recorded in Brazil, India, Indonesia, Mexico,
+     the Philippines, the United States, and Vietnam with a diverse set of adults in various categories.
+
+     The dataset contains two classification tasks, age and gender classification
+     ([Porgali et al., 2023](https://arxiv.org/abs/2303.04838)), which we phrase as multiple-choice
+     question answering.
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: exact_match
+     main_split: test
+   taxonomy:
+     task: audio classification
+     what: audio, spoken language, and the speaker's gender and age for each audio sample
+     who: real speakers
+     when: "2023"
+     language: 10 languages
+
+ - name: air_bench_chat
+   display_name: Air-Bench Chat
+   description: >
+     Air-Bench (Yang et al., 2024) encompasses two dimensions: foundation and chat benchmarks. The former
+     consists of 19 tasks with approximately 19k single-choice questions. The latter contains 2k instances
+     of open-ended question-and-answer data. We consider the chat benchmark in this scenario.
+
+     The dataset contains an audio question answering task in four subjects: sound, speech, music, and mixed
+     ([Yang et al., 2024](https://aclanthology.org/2024.acl-long.109.pdf)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: gpt4_audio_critique
+     main_split: test
+   taxonomy:
+     task: audio question answering
+     what: audio, question, and answer for each audio sample
+     who: real speakers
+     when: "2024"
+     language: English
+
+ - name: air_bench_foundation
+   display_name: Air-Bench Foundation
+   description: >
+     Air-Bench (Yang et al., 2024) encompasses two dimensions: foundation and chat benchmarks. The former
+     consists of 19 tasks with approximately 19k single-choice questions. The latter contains 2k instances
+     of open-ended question-and-answer data. We consider the foundation benchmark in this scenario.
+
+     The dataset contains audio multiple-choice question answering tasks in three subjects: musical
+     instrument classification, music genre recognition, and music question answering
+     ([Yang et al., 2024](https://aclanthology.org/2024.acl-long.109.pdf)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: exact_match
+     main_split: test
+   taxonomy:
+     task: audio question answering
+     what: audio, question, and answer for each question
+     who: real speakers
+     when: "2024"
+     language: English
+
+ - name: mutox
+   display_name: MuTox
+   description: >
+     MuTox is a multilingual audio-based dataset for toxicity detection. It consists of 20k audio
+     utterances for English and Spanish, and 4k for the other languages
+     ([Costa-jussà et al., 2024](https://arxiv.org/abs/2401.05060)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: exact_match
+     main_split: test
+   taxonomy:
+     task: toxicity detection
+     what: samples of utterances
+     who: real speakers
+     when: "2024"
+     language: 30 languages
+
+ - name: mustard
+   display_name: MUStARD
+   description: >
+     MUStARD is a multimodal video corpus for research in automated sarcasm discovery. The dataset is
+     compiled from popular TV shows including Friends, The Golden Girls, The Big Bang Theory, and
+     Sarcasmaholics Anonymous ([Castro et al., 2019](https://arxiv.org/abs/1906.01815)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: exact_match
+     main_split: test
+   taxonomy:
+     task: sarcasm detection
+     what: clips from television shows
+     who: real speakers
+     when: "2019"
+     language: English
+
+ - name: voice_jailbreak_attacks
+   display_name: Voice Jailbreak Attacks Against GPT-4o
+   description: >
+     Voice jailbreak attacks against GPT-4o ([Shen et al., 2024](https://arxiv.org/abs/2405.19103)).
+     The scenario measures whether models refuse unsafe spoken requests.
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: toxic_frac
+     main_split: test
+   taxonomy:
+     task: refusal for safety
+     what: voice jailbreak attacks against GPT-4o
+     who: AI-generated speech
+     when: "2024"
+     language: English
+
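`toxic_frac`, the main metric for the jailbreak scenario above, is by its name the fraction of model completions flagged as toxic; lower is better for a model that refuses unsafe requests. A sketch under that assumption, where the flags would come from an external toxicity classifier such as Perspective API:

```python
def toxic_frac(toxicity_flags: list) -> float:
    """Fraction of completions flagged toxic; 0.0 for an empty list."""
    if not toxicity_flags:
        return 0.0
    return sum(bool(flag) for flag in toxicity_flags) / len(toxicity_flags)

# Four completions, one flagged toxic by the (external) classifier.
print(toxic_frac([True, False, False, False]))  # 0.25
```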
+ - name: ami
+   display_name: AMI Meeting Corpus
+   description: >
+     The AMI Meeting Corpus (Carletta et al., 2005) is a multimodal dataset consisting of 100 hours of
+     meeting recordings, created in the context of a project developing meeting-browsing technology. The
+     corpus was recorded using a wide range of devices, including close-talking and far-field microphones,
+     individual and room-view video cameras, projection, a whiteboard, and individual pens, all of which
+     produce output signals that are synchronized with each other.
+
+     The dataset contains the audio and transcriptions for all subsets
+     ([Carletta et al., 2005](https://link.springer.com/chapter/10.1007/11677482_3)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: wer_score
+     main_split: test
+   taxonomy:
+     task: audio recognition
+     what: audio and transcripts of audio samples from meeting environments
+     who: real speakers
+     when: "2005"
+     language: English
+
+ - name: librispeech
+   display_name: LibriSpeech
+   description: >
+     The LibriSpeech corpus (Panayotov et al., 2015) is derived from audiobooks that are part of the
+     LibriVox project, and contains 1000 hours of speech sampled at 16 kHz. The data comes with
+     separately prepared language-model training data and pre-built language models. It is one of the
+     most widely used ASR corpora, and has been extended to many applications such as robust ASR and
+     multilingual ASR tasks.
+
+     The dataset contains the audio and transcriptions for all subsets
+     ([Panayotov et al., 2015](https://ieeexplore.ieee.org/document/7178964)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: wer_score
+     main_split: test
+   taxonomy:
+     task: audio recognition
+     what: audio and transcripts of audio samples in daily scenarios
+     who: real speakers
+     when: "2015"
+     language: English
+
+ - name: librispeech_fairness
+   display_name: LibriSpeech Fairness
+   description: >
+     The LibriSpeech corpus (Panayotov et al., 2015) is derived from audiobooks that are part of the
+     LibriVox project, and contains 1000 hours of speech sampled at 16 kHz. The data comes with
+     separately prepared language-model training data and pre-built language models. It is one of the
+     most widely used ASR corpora, and has been extended to many applications such as robust ASR and
+     multilingual ASR tasks.
+
+     The dataset contains the audio and transcriptions for all subsets. We ask the model to perform
+     ASR on audio files from different gender groups
+     ([Panayotov et al., 2015](https://ieeexplore.ieee.org/document/7178964)).
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: wer_score
+     main_split: test
+   taxonomy:
+     task: audio recognition
+     what: audio and transcripts of audio samples in daily scenarios
+     who: real speakers
+     when: "2015"
+     language: English
+
+ - name: parade
+   display_name: PARADE
+   description: >
+     The PARADE dataset is inspired by the PAIRS dataset for evaluating occupation and status bias in
+     vision-language models. We collect a new dataset for an audio-text multiple-choice QA task probing
+     occupation and status bias. The dataset consists of 436 audio-text QA pairs with 3 options each.
+   metric_groups:
+     - accuracy
+     - general_information
+   environment:
+     main_name: exact_match
+     main_split: test
+   taxonomy:
+     task: audio classification
+     what: audio, question, and answer given the audio
+     who: OpenAI's TTS
+     when: "2025"
+     language: English
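
Since these entries are plain YAML run groups, downstream tooling can introspect them. A sketch, assuming the surrounding file follows HELM's usual schema layout with a top-level `run_groups` key and is saved locally as `schema_audio.yaml` (the actual path inside the package may differ):

```python
import yaml  # pip install pyyaml

with open("schema_audio.yaml") as f:
    schema = yaml.safe_load(f)

# Print each run group's name alongside its main metric.
for group in schema["run_groups"]:
    print(group["name"], "->", group["environment"]["main_name"])
```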