crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the package's registry page for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,312 @@
1
+ """Scenarios for audio models"""
2
+
3
+ from typing import List
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Input,
12
+ Output,
13
+ )
14
+ from collections import OrderedDict
15
+ from tqdm import tqdm
16
+ from datasets import load_dataset
17
+ from helm.common.media_object import MediaObject, MultimediaObject
18
+
19
+
20
class FLEURSScenario(Scenario):
    """FLEURS Scenario

    The FLEURS (Conneau et al, 2022) dataset is an n-way parallel speech dataset in 102 languages
    built on top of the machine translation FLoRes-101 benchmark, with approximately 12 hours of speech
    supervision per language. The task is to identify the language used from the audio sample
    (the Speech Language Identification task).

    Paper: https://arxiv.org/abs/2205.12446
    Code: https://tensorflow.org/datasets/catalog/xtreme_s

    Citation:
    @inproceedings{conneau2023fleurs,
      title={Fleurs: Few-shot learning evaluation of universal representations of speech},
      author={Conneau, Alexis and Ma, Min and Khanuja, Simran and Zhang, Yu and Axelrod,
      Vera and Dalmia, Siddharth and Riesa, Jason and Rivera, Clara and Bapna, Ankur},
      booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
      pages={798--805},
      year={2023},
      organization={IEEE}
    }
    """

    # Hugging Face dataset repository that hosts the FLEURS configs.
    HF_DATASET_NAME = "google/xtreme_s"

    # Long language name -> short FLEURS language code (no region suffix).
    _FLEURS_LANG_TO_ID = OrderedDict(
        [
            ("Afrikaans", "af"),
            ("Amharic", "am"),
            ("Arabic", "ar"),
            ("Armenian", "hy"),
            ("Assamese", "as"),
            ("Asturian", "ast"),
            ("Azerbaijani", "az"),
            ("Belarusian", "be"),
            ("Bengali", "bn"),
            ("Bosnian", "bs"),
            ("Bulgarian", "bg"),
            ("Burmese", "my"),
            ("Catalan", "ca"),
            ("Cebuano", "ceb"),
            ("Mandarin_Chinese", "cmn_hans"),
            ("Cantonese_Chinese", "yue_hant"),
            ("Croatian", "hr"),
            ("Czech", "cs"),
            ("Danish", "da"),
            ("Dutch", "nl"),
            ("English", "en"),
            ("Estonian", "et"),
            ("Filipino", "fil"),
            ("Finnish", "fi"),
            ("French", "fr"),
            ("Fula", "ff"),
            ("Galician", "gl"),
            ("Ganda", "lg"),
            ("Georgian", "ka"),
            ("German", "de"),
            ("Greek", "el"),
            ("Gujarati", "gu"),
            ("Hausa", "ha"),
            ("Hebrew", "he"),
            ("Hindi", "hi"),
            ("Hungarian", "hu"),
            ("Icelandic", "is"),
            ("Igbo", "ig"),
            ("Indonesian", "id"),
            ("Irish", "ga"),
            ("Italian", "it"),
            ("Japanese", "ja"),
            ("Javanese", "jv"),
            ("Kabuverdianu", "kea"),
            ("Kamba", "kam"),
            ("Kannada", "kn"),
            ("Kazakh", "kk"),
            ("Khmer", "km"),
            ("Korean", "ko"),
            ("Kyrgyz", "ky"),
            ("Lao", "lo"),
            ("Latvian", "lv"),
            ("Lingala", "ln"),
            ("Lithuanian", "lt"),
            ("Luo", "luo"),
            ("Luxembourgish", "lb"),
            ("Macedonian", "mk"),
            ("Malay", "ms"),
            ("Malayalam", "ml"),
            ("Maltese", "mt"),
            ("Maori", "mi"),
            ("Marathi", "mr"),
            ("Mongolian", "mn"),
            ("Nepali", "ne"),
            ("Northern-Sotho", "nso"),
            ("Norwegian", "nb"),
            ("Nyanja", "ny"),
            ("Occitan", "oc"),
            ("Oriya", "or"),
            ("Oromo", "om"),
            ("Pashto", "ps"),
            ("Persian", "fa"),
            ("Polish", "pl"),
            ("Portuguese", "pt"),
            ("Punjabi", "pa"),
            ("Romanian", "ro"),
            ("Russian", "ru"),
            ("Serbian", "sr"),
            ("Shona", "sn"),
            ("Sindhi", "sd"),
            ("Slovak", "sk"),
            ("Slovenian", "sl"),
            ("Somali", "so"),
            ("Sorani-Kurdish", "ckb"),
            ("Spanish", "es"),
            ("Swahili", "sw"),
            ("Swedish", "sv"),
            ("Tajik", "tg"),
            ("Tamil", "ta"),
            ("Telugu", "te"),
            ("Thai", "th"),
            ("Turkish", "tr"),
            ("Ukrainian", "uk"),
            ("Umbundu", "umb"),
            ("Urdu", "ur"),
            ("Uzbek", "uz"),
            ("Vietnamese", "vi"),
            ("Welsh", "cy"),
            ("Wolof", "wo"),
            ("Xhosa", "xh"),
            ("Yoruba", "yo"),
            ("Zulu", "zu"),
        ]
    )
    # Full FLEURS config codes: short language code plus region suffix (e.g. "en_us").
    _FLEURS_LANG = sorted(
        [
            "af_za",
            "am_et",
            "ar_eg",
            "as_in",
            "ast_es",
            "az_az",
            "be_by",
            "bn_in",
            "bs_ba",
            "ca_es",
            "ceb_ph",
            "cmn_hans_cn",
            "yue_hant_hk",
            "cs_cz",
            "cy_gb",
            "da_dk",
            "de_de",
            "el_gr",
            "en_us",
            "es_419",
            "et_ee",
            "fa_ir",
            "ff_sn",
            "fi_fi",
            "fil_ph",
            "fr_fr",
            "ga_ie",
            "gl_es",
            "gu_in",
            "ha_ng",
            "he_il",
            "hi_in",
            "hr_hr",
            "hu_hu",
            "hy_am",
            "id_id",
            "ig_ng",
            "is_is",
            "it_it",
            "ja_jp",
            "jv_id",
            "ka_ge",
            "kam_ke",
            "kea_cv",
            "kk_kz",
            "km_kh",
            "kn_in",
            "ko_kr",
            "ckb_iq",
            "ky_kg",
            "lb_lu",
            "lg_ug",
            "ln_cd",
            "lo_la",
            "lt_lt",
            "luo_ke",
            "lv_lv",
            "mi_nz",
            "mk_mk",
            "ml_in",
            "mn_mn",
            "mr_in",
            "ms_my",
            "mt_mt",
            "my_mm",
            "nb_no",
            "ne_np",
            "nl_nl",
            "nso_za",
            "ny_mw",
            "oc_fr",
            "om_et",
            "or_in",
            "pa_in",
            "pl_pl",
            "ps_af",
            "pt_br",
            "ro_ro",
            "ru_ru",
            "bg_bg",
            "sd_in",
            "sk_sk",
            "sl_si",
            "sn_zw",
            "so_so",
            "sr_rs",
            "sv_se",
            "sw_ke",
            "ta_in",
            "te_in",
            "tg_tj",
            "th_th",
            "tr_tr",
            "uk_ua",
            "umb_ao",
            "ur_pk",
            "uz_uz",
            "vi_vn",
            "wo_sn",
            "xh_za",
            "yo_ng",
            "zu_za",
        ]
    )

    # Randomly selected 7 languages from 7 different groups (western_european_we, eastern_european_ee,
    # central_asia_middle_north_african_cmn, sub_saharan_african_ssa, south_asian_sa, south_east_asian_sea,
    # chinese_japanase_korean_cjk) in the FLEURS dataset.
    _FLEURS_TEST_LANG_TO_ID = OrderedDict(
        [
            ("Finnish", "fi"),
            ("English", "en"),
            ("Hebrew", "he"),
            ("Zulu", "zu"),
            ("Bengali", "bn"),
            ("Thai", "th"),
            ("Mandarin_Chinese", "cmn_hans"),
        ]
    )

    name = "fleurs"
    description = "Language identification for seven languages from seven different language groups \
        ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446))."
    tags: List[str] = ["audio", "recognition", "multilinguality"]

    def __init__(self, language: str) -> None:
        """Create a scenario for one language.

        Args:
            language: Long language name; must be a key of ``_FLEURS_TEST_LANG_TO_ID``
                (e.g. ``"English"``).

        Raises:
            ValueError: If ``language`` is not one of the supported test languages.
        """
        super().__init__()

        if language not in FLEURSScenario._FLEURS_TEST_LANG_TO_ID.keys():
            raise ValueError(
                f"Invalid language: {language}. Valid languages are: {FLEURSScenario._FLEURS_TEST_LANG_TO_ID.keys()}"
            )

        # Invert "long name -> short code" into "short code -> long name".
        self._fleurs_lang_short_to_long = {v: k for k, v in FLEURSScenario._FLEURS_LANG_TO_ID.items()}
        # Map long language name -> full FLEURS config code (e.g. "English" -> "en_us").
        # Stripping the last "_" segment of each config code recovers the short code;
        # the `or k` fallback covers a code with no underscore at all.
        self._fleurs_long_to_lang = {
            self._fleurs_lang_short_to_long["_".join(k.split("_")[:-1]) or k]: k for k in FLEURSScenario._FLEURS_LANG
        }

        self._language: str = language

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the FLEURS test split for the configured language and build instances.

        Each instance's input is the audio file (as a ``MultimediaObject``); the single
        correct reference is the row's ``transcription`` field.

        NOTE(review): the class docstring and ``description`` describe a language
        identification task, but the reference used here is the transcription (an ASR
        target), not a language name — confirm which task is intended.
        """
        instances: List[Instance] = []
        language_category = self._fleurs_long_to_lang[self._language]
        for row in tqdm(
            load_dataset(
                FLEURSScenario.HF_DATASET_NAME,
                name=f"fleurs.{language_category}",
                cache_dir=output_path,
                split=TEST_SPLIT,
                trust_remote_code=True,
            )
        ):
            # The dataset rows already point at audio files on local disk.
            local_audio_path = row["path"]
            answer = row["transcription"]
            input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
            )
            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
        return instances
@@ -0,0 +1,83 @@
1
+ import datasets
2
+ import os
3
+ from typing import List
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Input,
12
+ Output,
13
+ )
14
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array
15
+ from helm.common.general import ensure_directory_exists
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+
18
+
19
class IEMOCAPAudioScenario(Scenario):
    """IEMOCAP (Audio)

    This scenario is a emotion classification scenario based on the
    "Interactive emotional dyadic motion capture database" (IEMOCAP),
    collected by the Speech Analysis and Interpretation Laboratory (SAIL)
    at the University of Southern California (USC). Only the audio data
    from this dataset is used. The task is to classify the emotion of the
    speaker(s) in the audio sample as one of angry, happy, neutral or sad.

    Website: https://sail.usc.edu/iemocap/iemocap_release.htm
    Paper: https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf
    Dataset: https://huggingface.co/datasets/Zahra99/IEMOCAP_Audio/blob/main/README.md

    Citation:
    @article{article,
    author = {Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower Provost, Emily and Kim, Samuel and Chang, Jeannette and Lee, Sungbok and Narayanan, Shrikanth},
    year = {2008},
    month = {12},
    pages = {335-359},
    title = {IEMOCAP: Interactive emotional dyadic motion capture database},
    volume = {42},
    journal = {Language Resources and Evaluation},
    doi = {10.1007/s10579-008-9076-6}
    }
    """  # noqa: E501

    name = "iemocap_audio"
    description = "A classification scenario based on audio data from the Interactive emotional dyadic motion capture database (IEMOCAP) ([Busso et al, 2008](https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf)). The task is to classify the emotion of the speaker(s) in the audio sample."  # noqa: E501
    tags = ["audio", "classification"]

    # Label names in index order matching the dataset's integer "label" field.
    LABEL_NAMES = ["angry", "happy", "neutral", "sad"]
    # Sample rate (Hz) used when writing the audio arrays to WAV files.
    SAMPLE_RATE = 16000

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the IEMOCAP audio dataset and build one test instance per utterance.

        The raw audio arrays are materialized as WAV files under ``output_path``; each
        instance's input is the audio file and the correct reference is the emotion label.

        Fix: removed leftover debug ``print`` statements and a stray ``break`` that caused
        only the first row of each split to be converted into an instance.
        """
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        wav_dir = os.path.join(output_path, "wav")
        ensure_directory_exists(wav_dir)

        # Pin the dataset revision for reproducibility.
        dataset = datasets.load_dataset(
            "Zahra99/IEMOCAP_Audio", revision="4f8539a397ecc0d7185bf941bc1bb7238abc3648", cache_dir=cache_dir
        )
        instances: List[Instance] = []
        # All splits of the upstream dataset are treated as HELM test instances.
        for _, split in dataset.items():
            for row in split:
                wav_path = os.path.join(wav_dir, row["audio"]["path"])
                ensure_audio_file_exists_from_array(
                    wav_path, row["audio"]["array"], sample_rate=IEMOCAPAudioScenario.SAMPLE_RATE
                )
                input = Input(
                    multimedia_content=MultimediaObject(
                        media_objects=[MediaObject(location=wav_path, content_type="audio/wav")]
                    )
                )
                references = [
                    Reference(output=Output(text=IEMOCAPAudioScenario.LABEL_NAMES[row["label"]]), tags=[CORRECT_TAG])
                ]
                instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
        return instances
@@ -0,0 +1,96 @@
1
+ from typing import List
2
+ import os
3
+ import json
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Input,
12
+ Output,
13
+ )
14
+ from collections import OrderedDict
15
+ from tqdm import tqdm
16
+ from datasets import load_dataset
17
+ from helm.common.media_object import MediaObject, MultimediaObject
18
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array
19
+ from helm.common.general import ensure_file_downloaded
20
+
21
+
22
class LibriSpeechFairnessScenario(Scenario):
    """LibriSpeech Fairness Scenario
    The LibriSpeech corpus (Vassil et al. 2015) is derived from audiobooks that are part of the LibriVox
    project, and contains 1000 hours of speech sampled at 16 kHz. The data has separately prepared language-model
    training data and pre-built language models. This corpus is one of the most widely-used ASR corpus, which
    has been extended to many applicaitons such as robust ASR and multilingual ASR tasks.

    This scenario restricts the test split to utterances from speakers of a single gender
    (via an external speaker-id -> gender mapping) to enable fairness comparisons.

    Paper: https://ieeexplore.ieee.org/document/7178964
    Code: https://www.openslr.org/12

    Citation:
    @INPROCEEDINGS{7178964,
    author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
    booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
    title={Librispeech: An ASR corpus based on public domain audio books},
    year={2015},
    doi={10.1109/ICASSP.2015.7178964}}
    """

    HF_DATASET_NAME = "openslr/librispeech_asr"
    HF_MAPPING_URL = (
        "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
    )
    # Maps LibriSpeech speaker ids (as strings) to "male"/"female".
    GENDER_MAPPING_URL = (
        "https://huggingface.co/datasets/PahaII/librispeech_id2gender/resolve/main/librispeech_id2gender.json"
    )
    GENDERS = ["male", "female"]

    name = "librispeech_fairness"
    description = (
        "Widely-used speech corpus for the speech recognition task "
        "([Vassil et al. 2015](https://ieeexplore.ieee.org/document/7178964))."
    )
    tags: List[str] = ["audio", "recognition"]

    def __init__(self, gender: str) -> None:
        """Create a scenario restricted to speakers of the given gender.

        Args:
            gender: Either "male" or "female" (case-insensitive).

        Raises:
            ValueError: If ``gender`` is not one of ``GENDERS``.
        """
        super().__init__()

        if gender.lower() not in LibriSpeechFairnessScenario.GENDERS:
            # Fix: error message previously said "Valid languages are".
            raise ValueError(
                f"Invalid gender input: {gender}. Valid genders are: {LibriSpeechFairnessScenario.GENDERS}"
            )

        self._gender: str = gender

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build test instances for the "clean" LibriSpeech test split, keeping only
        utterances whose speaker matches the configured gender.

        Audio arrays are written out under ``output_path/audio_files``; the correct
        reference is the lower-cased transcript.
        """
        instances: List[Instance] = []
        audio_save_dir = os.path.join(output_path, "audio_files")
        mapping_local_path = os.path.join(output_path, "librispeech_id2gender.json")
        ensure_file_downloaded(
            source_url=LibriSpeechFairnessScenario.GENDER_MAPPING_URL, target_path=mapping_local_path
        )
        meta_data = load_dataset(
            LibriSpeechFairnessScenario.HF_DATASET_NAME,
            name="clean",
            cache_dir=output_path,
            split=TEST_SPLIT,
        )
        # Fix: use a context manager so the mapping file handle is closed
        # (previously `json.load(open(...))` leaked the handle).
        with open(mapping_local_path) as mapping_file:
            gender_mapping_dict = json.load(mapping_file)
        # Hoist the case-normalization out of the loop; filter and build in one pass.
        target_gender = self._gender.lower()
        for row in tqdm(meta_data):
            if gender_mapping_dict[str(row["speaker_id"])] != target_gender:
                continue
            local_audio_name = f"librispeech_{row['id']}.mp3"
            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
            answer = row["text"].lower()
            input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mp3", location=local_audio_path)])
            )
            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
        return instances
@@ -0,0 +1,80 @@
1
+ from typing import List
2
+ import os
3
+ import json
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Input,
12
+ Output,
13
+ )
14
+ from tqdm import tqdm
15
+ from datasets import load_dataset
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array
18
+ from helm.common.general import ensure_file_downloaded
19
+
20
+
21
class LibriSpeechScenario(Scenario):
    """LibriSpeech Corpus
    The LibriSpeech corpus (Vassil et al. 2015) is derived from audiobooks that are part of the LibriVox
    project, and contains 1000 hours of speech sampled at 16 kHz. The data has separately prepared language-model
    training data and pre-built language models. This corpus is one of the most widely-used ASR corpus, which
    has been extended to many applicaitons such as robust ASR and multilingual ASR tasks.

    Paper: https://ieeexplore.ieee.org/document/7178964
    Code: https://www.openslr.org/12

    Citation:
    @INPROCEEDINGS{7178964,
    author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
    booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
    title={Librispeech: An ASR corpus based on public domain audio books},
    year={2015},
    doi={10.1109/ICASSP.2015.7178964}}
    """

    HF_DATASET_NAME = "openslr/librispeech_asr"
    # External mapping that selects a fixed subset of instances (SRB keys) and maps
    # instance ids to row numbers of the HF dataset.
    HF_MAPPING_URL = (
        "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
    )
    SRB_KEY = "srb_librispeech_noises_key2audio"
    SRB_SUBSET = "gnoise.1"
    MAPPING_KEY = "librispeech_id2line"

    name = "librispeech"
    description = (
        "Widely-used speech corpus for the speech recognition task "
        "([Vassil et al. 2015](https://ieeexplore.ieee.org/document/7178964))."
    )
    tags: List[str] = ["audio", "recognition"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build test instances for the subset of the "clean" LibriSpeech test split
        selected by the downloaded SRB instance-key mapping.

        Audio arrays are written out under ``output_path/audio_files``; the correct
        reference is the lower-cased transcript.
        """
        instances: List[Instance] = []
        audio_save_dir = os.path.join(output_path, "audio_files")
        mapping_local_path = os.path.join(output_path, "srb_instance_keys.json")
        ensure_file_downloaded(source_url=LibriSpeechScenario.HF_MAPPING_URL, target_path=mapping_local_path)
        meta_data = load_dataset(
            LibriSpeechScenario.HF_DATASET_NAME,
            name="clean",
            cache_dir=output_path,
            split=TEST_SPLIT,
        )
        # Fix: use a context manager so the mapping file handle is closed
        # (previously `json.load(open(...))` leaked the handle).
        with open(mapping_local_path) as mapping_file:
            mapping_dict = json.load(mapping_file)
        srb_mapping_keys = mapping_dict[self.SRB_KEY][self.SRB_SUBSET]
        index2line_num = mapping_dict[self.MAPPING_KEY]
        for line_num in tqdm(list(srb_mapping_keys)):
            row = meta_data[int(index2line_num[line_num])]
            local_audio_name = f"{self.name}_{line_num}.mp3"
            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
            answer = row["text"].lower()
            input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mp3", location=local_audio_path)])
            )
            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
        return instances
@@ -0,0 +1,113 @@
1
+ import os
2
+ from typing import List
3
+
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ TEST_SPLIT,
9
+ TRAIN_SPLIT,
10
+ VALID_SPLIT,
11
+ Scenario,
12
+ Instance,
13
+ Reference,
14
+ CORRECT_TAG,
15
+ Input,
16
+ Output,
17
+ )
18
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array, get_array_from_audio_file
19
+ from helm.common.media_object import MediaObject, MultimediaObject
20
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
21
+
22
+
23
+ class MELDAudioScenario(Scenario):
24
+ """Multimodal EmotionLines Dataset (MELD) Audio
25
+
26
+ Multimodal EmotionLines Dataset (MELD) has been created by enhancing and extending EmotionLines dataset.
27
+ MELD has more than 1400 dialogues and 13000 utterances from Friends TV series. Multiple speakers participated
28
+ in the dialogues. Each utterance in a dialogue has been labeled by any of these seven emotions -
29
+ Anger, Disgust, Sadness, Joy, Neutral, Surprise and Fear.
30
+
31
+ The task is to classify the emotion based on only the audio clip.
32
+
33
+ Website: https://affective-meld.github.io/
34
+ Paper: https://arxiv.org/abs/1810.02508
35
+ Dataset: https://huggingface.co/datasets/DavidCombei/Wav2Vec_MELD_Audio
36
+
37
+ Citation:
38
+ S. Poria, D. Hazarika, N. Majumder, G. Naik, R. Mihalcea,
39
+ E. Cambria. MELD: A Multimodal Multi-Party Dataset
40
+ for Emotion Recognition in Conversation. (2018)
41
+
42
+ Chen, S.Y., Hsu, C.C., Kuo, C.C. and Ku, L.W.
43
+ EmotionLines: An Emotion Corpus of Multi-Party
44
+ Conversations. arXiv preprint arXiv:1802.08379 (2018).
45
+ """ # noqa: E501
46
+
47
+ name = "meld_audio"
48
+ description = "Classify emotions in audio clips from the television series Friends ([Poria et al, 2018](https://arxiv.org/abs/1810.02508))." # noqa: E501
49
+ tags = ["audio", "classification"]
50
+
51
+ LABEL_NAMES = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
52
+ CSV_URL_PREFIX = (
53
+ "https://raw.githubusercontent.com/declare-lab/MELD/2d2011b409d3ca2d7e94460cd007d434b1d0a102/data/MELD/"
54
+ )
55
+ SPLIT_NAME_TO_CSV_FILE_NAME = {
56
+ VALID_SPLIT: "dev_sent_emo.csv",
57
+ TRAIN_SPLIT: "train_sent_emo.csv",
58
+ TEST_SPLIT: "test_sent_emo.csv",
59
+ }
60
+ SPLIT_NAME_TO_TGZ_FILE_NAME = {
61
+ VALID_SPLIT: "audios_validation.tgz",
62
+ TRAIN_SPLIT: "audios_train.tgz",
63
+ TEST_SPLIT: "audios_test.tgz",
64
+ }
65
+ SAMPLE_RATE = 16000
66
+
67
+ def get_instances(self, output_path: str) -> List[Instance]:
68
+ csv_dir = os.path.join(output_path, "csv")
69
+ ensure_directory_exists(csv_dir)
70
+
71
+ instances: List[Instance] = []
72
+ # Only download the test split.
73
+ # We don't need the train split (because we use zero shot) or the validation split.
74
+ split_name = TEST_SPLIT
75
+
76
+ # Download the CSV to get the labels and IDs
77
+ csv_file_name = MELDAudioScenario.SPLIT_NAME_TO_CSV_FILE_NAME[split_name]
78
+ csv_file_path = os.path.join(csv_dir, csv_file_name)
79
+ ensure_file_downloaded(MELDAudioScenario.CSV_URL_PREFIX + csv_file_name, csv_file_path)
80
+ df = pd.read_csv(csv_file_path, header=0).rename(columns={"Sr No.": "serial_number"})
81
+
82
+ # Download FLAC files
83
+ flac_dir = os.path.join(output_path, f"flac_{split_name}")
84
+ ensure_file_downloaded(
85
+ source_url=f"https://huggingface.co/datasets/zrr1999/MELD_Text_Audio/resolve/main/archive/{MELDAudioScenario.SPLIT_NAME_TO_TGZ_FILE_NAME[split_name]}?download=true", # noqa: E501
86
+ target_path=flac_dir,
87
+ unpack=True,
88
+ unpack_type="untar",
89
+ )
90
+
91
+ wav_dir = os.path.join(output_path, f"wav_{split_name}")
92
+ ensure_directory_exists(wav_dir)
93
+ for row in tqdm(df.itertuples()):
94
+ # Transcode FLAC to WAV
95
+ wav_file_name = f"dia{row.Dialogue_ID}_utt{row.Utterance_ID}.wav"
96
+ wav_file_path = os.path.join(wav_dir, wav_file_name)
97
+ if not os.path.isfile(wav_file_path):
98
+ flac_file_name = f"dia{row.Dialogue_ID}_utt{row.Utterance_ID}.flac"
99
+ flac_file_path = os.path.join(flac_dir, flac_file_name)
100
+ audio_array = get_array_from_audio_file(flac_file_path, MELDAudioScenario.SAMPLE_RATE)
101
+ ensure_audio_file_exists_from_array(wav_file_path, audio_array, MELDAudioScenario.SAMPLE_RATE)
102
+ input = Input(
103
+ multimedia_content=MultimediaObject(
104
+ media_objects=[MediaObject(location=wav_file_path, content_type="audio/wav")]
105
+ )
106
+ )
107
+ assert row.Emotion in MELDAudioScenario.LABEL_NAMES
108
+ references = [Reference(output=Output(text=row.Emotion), tags=[CORRECT_TAG])]
109
+ instance = Instance(
110
+ id=str(f"awoo{row.serial_number}"), input=input, references=references, split=split_name
111
+ )
112
+ instances.append(instance)
113
+ return instances