crfm-helm: crfm_helm-0.5.3-py3-none-any.whl → crfm_helm-0.5.5-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -0,0 +1,254 @@
1
+ from io import BytesIO
2
+ from typing import List
3
+ import os
4
+ import requests
5
+
6
+ from pydub import AudioSegment
7
+ from tqdm import tqdm
8
+ import pandas as pd
9
+
10
+ from helm.benchmark.scenarios.scenario import (
11
+ TEST_SPLIT,
12
+ Scenario,
13
+ Instance,
14
+ Reference,
15
+ CORRECT_TAG,
16
+ Input,
17
+ Output,
18
+ )
19
+ from helm.common.audio_utils import is_invalid_audio_file
20
+ from helm.common.media_object import MediaObject, MultimediaObject
21
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
22
+ from helm.common.hierarchical_logger import hlog, htrack_block
23
+
24
+
25
class MuToxScenario(Scenario):
    """
    MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector

    MuTox, the first highly multilingual audio-based dataset with toxicity labels. The dataset consists of 20k
    audio utterances for English and Spanish, and 4k for the other languages. To showcase the quality of this
    dataset, we train the MuTox audio-based toxicity classifier, which allows zero-shot toxicity detection across
    a broad range of languages. This classifier outperforms existing text-based trainable classifiers by more than
    1% AUC, while increasing the language coverage from 8 to 100+ languages. When compared to a wordlist-based
    classifier that covers a similar number of languages, MuTox improves precision and recall by ∼2.5 times.

    Languages:
        "Arabic": "arb",
        "Bengali": "ben",
        "Bulgarian": "bul",
        "Catalan": "cat",
        "Czech": "ces",
        "Mandarin Chinese": "cmn",
        "Danish": "dan",
        "German": "deu",
        "Greek": "ell",
        "English": "eng",
        "Estonian": "est",
        "Western Persian": "fas",
        "Finnish": "fin",
        "French": "fra",
        "Hebrew": "heb",
        "Hindi": "hin",
        "Hungarian": "hun",
        "Indonesian": "ind",
        "Italian": "ita",
        "Dutch": "nld",
        "Polish": "pol",
        "Portuguese": "por",
        "Russian": "rus",
        "Spanish": "spa",
        "Slovak": "slk",
        "Swahili": "swh",
        "Tagalog": "tgl",
        "Turkish": "tur",
        "Urdu": "urd",
        "Vietnamese": "vie",

    The columns of the dataset are:

        id: a string id of the segment;
        lang: 3-letter language code;
        partition: one of train, dev, or devtest
        public_url_segment: a string formatted as url:start:end, where start and end are indicated in milliseconds;
        audio_file_transcript: text transcription of the segment;
        contains_toxicity, toxicity_types, perlocutionary_effects: annotation results as strings
        label: an integer label, equal to 1 if contains_toxicity equals Yes and 0 otherwise;
        etox_result: toxic word (or multiple words, separated by |) detected by the Etox matcher;
        detoxify_score: toxicity probabilities predicted by the Detoxify system (float numbers between 0 and 1);
        mutox_speech_score, mutox_text_score, mutox_zero_shot_speech_score, mutox_zero_shot_text_score: MuTox predictions
            as float numbers with any value (they can be interpreted as logits,
            i.e. probabilities before a sigmoid transformation).

    Citation:

    @misc{costajussà2023mutox,
        title={MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector},
        author={ Marta R. Costa-jussà, Mariano Coria Meglioli, Pierre Andrews, David Dale, Prangthip Hansanti,
        Elahe Kalbassi, Alex Mourachko, Christophe Ropers, Carleigh Wood},
        year={2023},
        eprint={},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
    }
    """

    ANNOTATIONS_URL = "https://dl.fbaipublicfiles.com/seamless/datasets/mutox.tsv"

    # Upper bound (seconds) on waiting for a remote host while fetching a source recording.
    # Without a timeout a single unresponsive server would stall the whole scenario.
    DOWNLOAD_TIMEOUT_SECONDS = 60

    # NOTE: the attribute name is misspelled ("LANGAUGE") but is kept as-is because
    # code outside this class may reference it. Keys are the accepted `language` arguments.
    LANGAUGE_CODES = {
        "Arabic": "arb",
        "Bengali": "ben",
        "Bulgarian": "bul",
        "Catalan": "cat",
        "Czech": "ces",
        "Mandarin_Chinese": "cmn",
        "Danish": "dan",
        "German": "deu",
        "Greek": "ell",
        "English": "eng",
        "Estonian": "est",
        "Western_Persian": "fas",
        "Finnish": "fin",
        "French": "fra",
        "Hebrew": "heb",
        "Hindi": "hin",
        "Hungarian": "hun",
        "Indonesian": "ind",
        "Italian": "ita",
        "Dutch": "nld",
        "Polish": "pol",
        "Portuguese": "por",
        "Russian": "rus",
        "Spanish": "spa",
        "Slovak": "slk",
        "Swahili": "swh",
        "Tagalog": "tgl",
        "Turkish": "tur",
        "Urdu": "urd",
        "Vietnamese": "vie",
    }

    name = "mutox"
    description = "Toxicity detection benchmark ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060))."
    tags = ["audio", "classification", "toxicity"]

    @staticmethod
    def track_bad_audio_file(bad_audio_file: str, output_path: str) -> None:
        """
        Append `bad_audio_file` as a new line to the file at `output_path`.

        Many of the links do not exist or point to broken files, so we keep track of them
        and skip them in future runs to significantly speed up gathering the instances.
        """
        with open(output_path, "a") as f:
            f.write(bad_audio_file + "\n")

    def __init__(self, language: str) -> None:
        """
        Args:
            language: a key of `LANGAUGE_CODES` (e.g. "English", "Mandarin_Chinese").

        Raises:
            KeyError: if `language` is not a key of `LANGAUGE_CODES`.
        """
        super().__init__()
        self._language_code: str = self.LANGAUGE_CODES[language]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Build one yes/no toxicity instance per usable `devtest` row of the selected language.

        Downloads the annotation TSV, then for every matching row downloads the source
        recording, clips it to the annotated segment and stores it as an MP3 under
        `<output_path>/audio`. Rows whose audio cannot be obtained or validated are skipped
        and recorded in `<output_path>/bad_audio_files.txt` so later runs do not retry them.

        Args:
            output_path: directory used for the annotations, the audio clips and the
                bad-file list.

        Returns:
            The instances, all in `TEST_SPLIT`, each with "Yes"/"No" references.

        Raises:
            AssertionError: if no instance could be produced for the language.
        """
        # Download the annotations
        annotations_path: str = os.path.join(output_path, "mutox.tsv")
        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)

        # Read the audio files that earlier runs already found to be unusable
        bad_audio_files: set[str] = set()
        bad_audio_files_path: str = os.path.join(output_path, "bad_audio_files.txt")
        if os.path.exists(bad_audio_files_path):
            # Each line is the audio file name
            with open(bad_audio_files_path, "r") as f:
                bad_audio_files = {line.strip() for line in f}
            hlog(f"Found {len(bad_audio_files)} bad audio files.")

        # Where the audio files will be downloaded to
        audio_path: str = os.path.join(output_path, "audio")
        ensure_directory_exists(audio_path)

        instances: List[Instance] = []
        df = pd.read_csv(annotations_path, delimiter="\t")
        hlog(f"Found {len(df)} rows in the dataset")

        valid_count: int = 0
        total_count: int = 0
        for row in tqdm(df.itertuples()):
            # Only process examples that are in devtest and the language we're interested in
            if row.partition != "devtest":
                continue

            if row.lang != self._language_code:
                continue

            total_count += 1

            audio_filename: str = f"{row.id}.mp3"
            with htrack_block(f"Processing audio file: {audio_filename}"):
                # Discard known bad audio files
                if audio_filename in bad_audio_files:
                    hlog(f"Skipping this example -- known bad audio file: {audio_filename}")
                    continue

                local_audio_path: str = os.path.join(audio_path, audio_filename)
                if not os.path.exists(local_audio_path):
                    # The provided URL has the complete audio, so we need to download it and clip it
                    if not isinstance(row.public_url_segment, str):
                        # Sometimes URL is just a float causing an error. Skip those.
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    # NOTE(review): the dataset description says "url:start:end", but the field
                    # is split on whitespace here -- confirm against the actual TSV contents.
                    segment_parts = row.public_url_segment.split()
                    if len(segment_parts) != 3:
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    audio_url, start_ms_str, end_ms_str = segment_parts
                    try:
                        start_ms, end_ms = int(start_ms_str), int(end_ms_str)
                    except ValueError:
                        # Non-numeric offsets: treat like any other malformed segment
                        # instead of aborting the whole scenario.
                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
                        continue

                    try:
                        # Download the full audio file; the timeout keeps a dead host from hanging the run
                        response = requests.get(audio_url, timeout=self.DOWNLOAD_TIMEOUT_SECONDS)
                        response.raise_for_status()

                        # Load the audio from the downloaded bytes, clip it and save the clip
                        audio = AudioSegment.from_file(BytesIO(response.content))
                        clipped_audio = audio[start_ms:end_ms]
                        clipped_audio.export(local_audio_path, format="mp3")
                    except Exception as e:
                        # Deliberately broad: some files are no longer available or invalid,
                        # and any such failure should only skip this example.
                        hlog(
                            f"Skipping this example -- error downloading / processing audio file from {audio_url}: {e}"
                        )
                        self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                        continue

                if is_invalid_audio_file(local_audio_path):
                    hlog(f"Skipping this example -- Invalid audio file even after downloading at {local_audio_path}")
                    self.track_bad_audio_file(audio_filename, bad_audio_files_path)
                    continue

                valid_count += 1
                hlog(f"Valid audio file found at {local_audio_path} (valid/total: {valid_count}/{total_count})")

                instance_input = Input(
                    multimedia_content=MultimediaObject(
                        media_objects=[
                            # Following what was prompted annotators as outlined in the paper
                            MediaObject(text="Does the utterance contain toxicity?", content_type="text/plain"),
                            MediaObject(location=local_audio_path, content_type="audio/mpeg"),
                        ]
                    )
                )

                is_toxic: bool = row.label == 1
                references = [
                    Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_toxic else []),
                    Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_toxic else []),
                ]
                instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))

        assert len(instances) > 0, f"No instances found for language: {self._language_code}"
        return instances
@@ -0,0 +1,97 @@
1
+ from typing import List
2
+ import os
3
+ import json
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Input,
12
+ Output,
13
+ )
14
+ from tqdm import tqdm
15
+ from helm.common.media_object import MediaObject, MultimediaObject
16
+ from helm.common.general import ensure_file_downloaded
17
+
18
+
19
class PARADEScenario(Scenario):
    """PARADE

    PARADE dataset is inspired by the PAIRS dataset for evaluating occupation and status bias
    in vision-language models. We collect a new dataset of audio-text multi-choice QA task that
    involves exploring occupation and status bias. The dataset consists of 436 audio-text QA pairs
    with 3 options each.
    """

    ANNOT_URL = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/audio_result_path_mapping_v2.json"
    MALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/onyx.zip"
    FEMALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/nova.zip"

    # Appended to every question so the model answers with an option letter.
    PARADE_INSTRUCTION = "\n\n Answer the question with one of the following options: A, B, or C."

    SUBSET_LIST = ["occupation", "status"]
    # Maps the user-facing voice name to the top-level key used in the annotation file.
    VOICE_MAPPING = {"male": "onyx", "female": "nova"}

    name = "parade"
    description = "Exploring occupation and status bias in the audio-text multi-choice QA task."
    tags: List[str] = ["audio", "bias"]

    def __init__(self, subset: str, voice: str) -> None:
        """
        Args:
            subset: one of `SUBSET_LIST` (case-insensitive).
            voice: one of the keys of `VOICE_MAPPING` (case-insensitive).

        Raises:
            ValueError: if `subset` or `voice` is not recognised.
        """
        super().__init__()

        subset = subset.lower()
        voice = voice.lower()
        if subset not in PARADEScenario.SUBSET_LIST:
            raise ValueError(f"Invalid subset. Valid subsets are: {PARADEScenario.SUBSET_LIST}")

        if voice not in PARADEScenario.VOICE_MAPPING.keys():
            raise ValueError(f"Invalid voice. Valid voices are: {PARADEScenario.VOICE_MAPPING.keys()}")

        self._subset: str = subset
        self._voice: str = voice

    def _convert_answer_to_label(self, options: list, answer: str) -> str:
        """Return the letter ("A", "B" or "C") for the position of `answer` within `options`.

        Raises:
            ValueError: if `answer` is not in `options`.
            IndexError: if `answer` sits beyond the third option.
        """
        option_list = ["A", "B", "C"]
        return option_list[options.index(answer)]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Download the annotations and the audio archive for the configured voice, then
        build one multiple-choice instance per annotated question.

        Args:
            output_path: directory where the annotation file and audio files are stored.

        Returns:
            The instances, all in `TEST_SPLIT`; each option is a reference and the
            option equal to the row's label carries `CORRECT_TAG`.
        """
        annot_save_path = os.path.join(output_path, "annotation.json")
        ensure_file_downloaded(source_url=PARADEScenario.ANNOT_URL, target_path=annot_save_path)
        # Use a context manager so the annotation file handle is closed deterministically.
        with open(annot_save_path) as annot_file:
            annotations = json.load(annot_file)[self.VOICE_MAPPING[self._voice]][self._subset]

        # The annotations are nested two levels deep; flatten them into a list of rows.
        test_annotations = [row for group in annotations.values() for row in group.values()]

        audio_save_dir = os.path.join(output_path, "audio_files")
        audio_url = PARADEScenario.MALE_AUDIO if self._voice == "male" else PARADEScenario.FEMALE_AUDIO
        ensure_file_downloaded(source_url=audio_url, target_path=audio_save_dir, unpack=True)

        instances: List[Instance] = []
        for row in tqdm(test_annotations):
            local_audio_path = os.path.join(audio_save_dir, row["path"])
            answer = self._convert_answer_to_label(row["options"], row["label"])
            # The helper yields a letter, but we need an index to compare against
            correct_answer_index: int = ord(answer) - ord("A")
            references: List[Reference] = [
                Reference(Output(text=option), tags=[CORRECT_TAG] if i == correct_answer_index else [])
                for i, option in enumerate(row["options"])
            ]

            instance_input = Input(
                multimedia_content=MultimediaObject(
                    [
                        MediaObject(content_type="audio/mpeg", location=local_audio_path),
                        MediaObject(content_type="text/plain", text=row["question"] + self.PARADE_INSTRUCTION),
                    ]
                )
            )
            instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))
        return instances
@@ -0,0 +1,124 @@
1
+ """Scenarios for audio models"""
2
+
3
+ from typing import List
4
+ import os
5
+ import json
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ Scenario,
9
+ Instance,
10
+ Reference,
11
+ TEST_SPLIT,
12
+ CORRECT_TAG,
13
+ Input,
14
+ Output,
15
+ )
16
+ from tqdm import tqdm
17
+ from datasets import load_dataset
18
+ from helm.common.media_object import MediaObject, MultimediaObject
19
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array
20
+ from helm.common.general import ensure_file_downloaded
21
+
22
+
23
class SpeechRobustBenchScenario(Scenario):
    """Speech Robust Bench Scenario

    Speech Robust Bench (Shah et al, 2024) is a comprehensive benchmark for evaluating
    the robustness of ASR models to diverse corruptions. SRB is composed of 114 input
    perturbations which simulate an heterogeneous range of corruptions that ASR models
    may encounter when deployed in the wild. In this scenario, we select four subsets
    in the benchmark for evaluation, each corresponds to a clean version of audio task.

    Paper: https://arxiv.org/abs/2403.07937
    Code: https://github.com/ahmedshah1494/speech_robust_bench

    Citation:
    @article{shah2024speech,
        title={Speech robust bench: A robustness benchmark for speech recognition},
        author={Shah, Muhammad A and Noguero, David Solans and Heikkila, Mikko A and Raj,
        Bhiksha and Kourtellis, Nicolas},
        journal={arXiv preprint arXiv:2403.07937},
        year={2024}
    }
    """

    HF_DATASET_NAME = "mshah1/speech_robust_bench"
    HF_MAPPING_URL = (
        "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
    )

    # Select four subsets of the dataset for the benchmark.
    # Per subject: "name" is the dataset config name, "split" the dataset split, "type" the
    # media content type, and "mapping_key" the top-level key in the instance-key mapping file.
    SUBJECTS_DICT = {
        "ami_far": {
            "name": "in-the-wild-AMI",
            "split": "farfield",
            "type": "audio/wav",
            "mapping_key": "srb_aim_field_key2audio",
        },
        "ami_near": {
            "name": "in-the-wild-AMI",
            "split": "nearfield",
            "type": "audio/wav",
            "mapping_key": "srb_aim_field_key2audio",
        },
        "librispeech_gnoise": {
            "name": "librispeech_asr-test.clean_pertEval_500_30",
            "split": "gnoise.1",
            "type": "audio/mp3",
            "mapping_key": "srb_librispeech_noises_key2audio",
        },
        "librispeech_env_noise": {
            "name": "librispeech_asr-test.clean_pertEval_500_30",
            "split": "env_noise_esc50.1",
            "type": "audio/mp3",
            "mapping_key": "srb_librispeech_noises_key2audio",
        },
    }
    # There are 30 different perturbation samples for each LibriSpeech ID
    PERTURBATION_LEVELS = list(range(1, 31))
    name = "speech_robust_bench"
    description = (
        "Speech recognition for 4 datasets with a wide range of corruptions "
        "([Shah et al, 2024](https://arxiv.org/abs/2403.07937))."
    )
    tags: List[str] = ["audio", "recognition", "robustness", "multilinguality"]

    def __init__(self, subject: str, level: int) -> None:
        """
        Args:
            subject: a key of `SUBJECTS_DICT`.
            level: a value in `PERTURBATION_LEVELS` (1 to 30); used as a 1-based position
                into each instance's list of dataset row indices.

        Raises:
            ValueError: if `subject` or `level` is not valid.
        """
        super().__init__()

        self._subject = subject
        if self._subject not in SpeechRobustBenchScenario.SUBJECTS_DICT.keys():
            raise ValueError(f"Invalid subject. Valid subjects are: {SpeechRobustBenchScenario.SUBJECTS_DICT.keys()}")
        self._level = level
        if self._level not in SpeechRobustBenchScenario.PERTURBATION_LEVELS:
            raise ValueError(f"Invalid level. Valid levels are: {SpeechRobustBenchScenario.PERTURBATION_LEVELS}")

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Load the configured subset, materialise one audio file per mapped instance and
        pair it with its lower-cased transcript as the single correct reference.

        Args:
            output_path: directory used for the mapping file, the dataset cache and the
                generated audio files.

        Returns:
            The instances, all in `TEST_SPLIT`.
        """
        subject_config = SpeechRobustBenchScenario.SUBJECTS_DICT[self._subject]
        subject_split = subject_config["split"]
        subject_type = subject_config["type"]
        # File extension is the subtype of the content type, e.g. "audio/wav" -> "wav"
        subject_audio_type = subject_type.split("/")[-1]

        audio_save_dir = os.path.join(output_path, "audio_files")
        # Make sure the target directory exists before audio files are written into it;
        # harmless if the audio helper also creates it.
        os.makedirs(audio_save_dir, exist_ok=True)

        mapping_local_path = os.path.join(output_path, "srb_instance_keys.json")
        ensure_file_downloaded(source_url=SpeechRobustBenchScenario.HF_MAPPING_URL, target_path=mapping_local_path)
        # Use a context manager so the mapping file handle is closed deterministically.
        with open(mapping_local_path) as mapping_file:
            mapping_keys = json.load(mapping_file)[subject_config["mapping_key"]][subject_split]

        meta_data = load_dataset(
            SpeechRobustBenchScenario.HF_DATASET_NAME,
            name=subject_config["name"],
            cache_dir=output_path,
            split=subject_split,
        )

        instances: List[Instance] = []
        for line_num, row_indices in tqdm(mapping_keys.items()):
            # Each mapping entry lists dataset row indices; pick the one for the requested level
            row = meta_data[int(row_indices[self._level - 1])]
            local_audio_name = f"{self._subject}_{subject_split}_{line_num}.{subject_audio_type}"
            local_audio_path = os.path.join(audio_save_dir, local_audio_name)
            ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
            answer = row["text"].lower()
            instance_input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type=subject_type, location=local_audio_path)])
            )
            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
            instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))
        return instances
@@ -0,0 +1,69 @@
1
+ """Scenarios for audio models"""
2
+
3
+ from typing import List
4
+ import os
5
+
6
+ from helm.benchmark.scenarios.scenario import (
7
+ Scenario,
8
+ Instance,
9
+ Reference,
10
+ TEST_SPLIT,
11
+ CORRECT_TAG,
12
+ Input,
13
+ Output,
14
+ )
15
+ from tqdm import tqdm
16
+
17
+ from helm.common.media_object import MediaObject, MultimediaObject
18
+ from helm.common.general import ensure_file_downloaded
19
+ from helm.common.audio_utils import is_invalid_audio_file
20
+
21
+
22
class VocalSoundScenario(Scenario):
    """Vocal Sound Scenario

    The VocalSound (Gong et al, 2022) dataset consists of 21,000 crowdsourced recordings
    of laughter, sighs, coughs, throat clearing, sneezes, and sniffs from 3,365 unique subjects.
    The task is to classify the human behaviour from the audio sample.

    Paper: https://arxiv.org/abs/2205.03433
    Code: https://github.com/YuanGongND/vocalsound

    Citation:
    @INPROCEEDINGS{gong_vocalsound,
        author={Gong, Yuan and Yu, Jin and Glass, James},
        booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
        title={Vocalsound: A Dataset for Improving Human Vocal Sounds Recognition},
        year={2022},
        pages={151-155},
        doi={10.1109/ICASSP43922.2022.9746828}
    }
    """  # noqa: E501

    DOWNLOADING_URL = "https://www.dropbox.com/s/c5ace70qh1vbyzb/vs_release_16k.zip"

    name = "vocal_sound"
    description = "Classify an audio sample of a human vocal sound ([Gong et al, 2022](https://arxiv.org/abs/2205.03433))."
    tags: List[str] = ["audio", "classification"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Download and unpack the release archive, then build one instance per valid
        `.wav` file, using the label encoded in the file name as the correct reference.

        Args:
            output_path: directory under which the archive is downloaded and unpacked.

        Returns:
            The instances, all in `TEST_SPLIT`, ordered by file name.
        """
        instances: List[Instance] = []
        down_loading_path = os.path.join(output_path, "download")
        ensure_file_downloaded(VocalSoundScenario.DOWNLOADING_URL, down_loading_path, unpack=True)
        wav_save_dir = os.path.join(down_loading_path, "audio_16k")
        # os.listdir returns entries in arbitrary order; sort so that the instance
        # order is reproducible across machines and runs.
        for file_name in tqdm(sorted(os.listdir(wav_save_dir))):
            local_audio_path: str = os.path.join(wav_save_dir, file_name)
            if not file_name.endswith(".wav") or is_invalid_audio_file(local_audio_path):
                continue

            instance_input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
            )

            # The label is the last underscore-separated token of the file name, minus the extension
            answer: str = file_name.split("_")[-1].split(".")[0]
            if answer == "throatclearing":
                answer = "throat clearing"

            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
            instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))
        return instances
@@ -0,0 +1,87 @@
1
+ import os
2
+ from typing import List
3
+
4
+ from helm.benchmark.scenarios.scenario import (
5
+ TEST_SPLIT,
6
+ Scenario,
7
+ Instance,
8
+ Input,
9
+ )
10
+ from helm.common.audio_utils import is_invalid_audio_file
11
+ from helm.common.media_object import MediaObject, MultimediaObject
12
+
13
+
14
class VoiceJailbreakAttacksScenario(Scenario):
    """
    Voice Jailbreak Attacks Against GPT-4o

    Paper: https://arxiv.org/abs/2405.19103

    The examples used in this scenario were generated following the instructions in the GitHub repository:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack
    Note: The more advanced jailbreaking subset addresses the AI as "ChatGPT".

    We ran:
    python tts/prompt2audio.py --dataset baseline --voice fable
    python tts/prompt2audio.py --dataset textjailbreak --voice fable

    then placed the generated folders at benchmark_output/scenarios/voice_jailbreak_attacks:

    voice_jailbreak_attacks/
        baseline_fable/
            *.wav
        textjailbreak_fable/
            *.wav

    Base prompts (30 total):
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/question_set/questions_tiny.csv
    Jailbreak templates:
    https://github.com/TrustAIRLab/VoiceJailbreakAttack/blob/main/data/jailbreak_prompts/text_jailbreak_prompts.csv

    The authors used Attack Success Rate (ASR) as the evaluation metric.
    Specifically, two authors manually label each response to determine if it answers the forbidden
    question. If there is any uncertainty, the authors discuss to reach a conclusion.

    Citation:

    @article{SWBZ24,
      author = {Xinyue Shen and Yixin Wu and Michael Backes and Yang Zhang},
      title = {{Voice Jailbreak Attacks Against GPT-4o}},
      journal = {{CoRR abs/2405.19103}},
      year = {2024}
    }
    """

    # Subsets for which a "<subset>_fable" audio folder is expected under the output path.
    VALID_SUBSETS = ["baseline", "textjailbreak"]

    name = "voice_jailbreak_attacks"
    description = "Voice jailbreak attacks against GPT-4o ([Shen et al, 2024](https://arxiv.org/abs/2405.19103))."
    tags = ["audio", "safety"]

    def __init__(self, subset: str = "baseline") -> None:
        """Select which subset of generated audio prompts to load.

        Args:
            subset: One of ``VALID_SUBSETS``.

        Raises:
            AssertionError: If ``subset`` is not in ``VALID_SUBSETS``.
        """
        super().__init__()
        assert subset in self.VALID_SUBSETS, f"Invalid subset: {subset}"
        self._subset = subset

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one reference-free test instance per ``.wav`` file of the selected subset.

        Args:
            output_path: Directory that contains the manually placed ``<subset>_fable`` folder.

        Returns:
            Instances in the test split, each with a single ``audio/wav`` media object and
            no references.

        Raises:
            AssertionError: If the audio directory does not exist, or if
                ``is_invalid_audio_file`` reports one of its ``.wav`` files as invalid.
        """
        audio_directory_path: str = os.path.join(output_path, f"{self._subset}_fable")
        assert os.path.exists(audio_directory_path), f"Audio directory does not exist: {audio_directory_path}"

        instances: List[Instance] = []
        # os.listdir returns entries in arbitrary order; sort so the instance order
        # is reproducible across machines and runs.
        for file_name in sorted(os.listdir(audio_directory_path)):
            # Match the extension including the dot so names merely ending in "wav" are skipped.
            if not file_name.endswith(".wav"):
                continue

            audio_path: str = os.path.join(audio_directory_path, file_name)
            assert not is_invalid_audio_file(audio_path), f"Invalid audio file: {audio_path}"

            audio_input = Input(
                multimedia_content=MultimediaObject(
                    media_objects=[
                        MediaObject(location=audio_path, content_type="audio/wav"),
                    ]
                )
            )
            instances.append(Instance(input=audio_input, references=[], split=TEST_SPLIT))

        return instances