crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,80 @@
1
+ from typing import List
2
+ import os
3
+
4
+ from helm.benchmark.scenarios.scenario import (
5
+ Scenario,
6
+ Instance,
7
+ Reference,
8
+ TEST_SPLIT,
9
+ CORRECT_TAG,
10
+ Input,
11
+ Output,
12
+ )
13
+ from tqdm import tqdm
14
+ from datasets import load_dataset
15
+ from helm.common.media_object import MediaObject, MultimediaObject
16
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array
17
+
18
+
19
+ class MultilingualLibriSpeechScenario(Scenario):
20
+ """Multilingual Librispeech
21
+
22
+ The Multilingual LibriSpeech (Pratap et al, 2020) dataset is derived from read audiobooks
23
+ from LibriVox and consists of 8 languages, including about 44.5K hours of English and a total
24
+ of about 6K hours for other 7 languages. The task is to recognize the textual content from the
25
+ audio sample.
26
+
27
+ Paper: https://arxiv.org/abs/2012.03411
28
+ Code: https://www.openslr.org/
29
+
30
+ Citation:
31
+ @article{Pratap2020MLSAL,
32
+ title={MLS: A Large-Scale Multilingual Dataset for Speech Research},
33
+ author={Vineel Pratap and Qiantong Xu and Anuroop Sriram and Gabriel Synnaeve and Ronan Collobert},
34
+ journal={ArXiv},
35
+ year={2020},
36
+ volume={abs/2012.03411}
37
+ }
38
+ """
39
+
40
+ HF_DATASET_NAME = "facebook/multilingual_librispeech"
41
+ LANGUAGE_LIST: List[str] = ["dutch", "german", "french", "spanish", "italian", "portuguese", "polish"]
42
+
43
+ name = "multilingual_librispeech"
44
+ description = (
45
+ "Speech recognition in 7 different languages ([Pratap et al, 2022](https://arxiv.org/abs/2012.03411))."
46
+ )
47
+ tags: List[str] = ["audio", "multilinguality", "recognition"]
48
+
49
+ def __init__(self, language: str) -> None:
50
+ super().__init__()
51
+
52
+ language = language.lower()
53
+ if language not in MultilingualLibriSpeechScenario.LANGUAGE_LIST:
54
+ raise ValueError(f"Invalid language. Valid languages are: {MultilingualLibriSpeechScenario.LANGUAGE_LIST}")
55
+
56
+ self._language: str = language
57
+
58
+ def get_instances(self, output_path: str) -> List[Instance]:
59
+ instances: List[Instance] = []
60
+ audio_save_dir = os.path.join(output_path, "audio_files")
61
+ for idx, row in enumerate(
62
+ tqdm(
63
+ load_dataset(
64
+ MultilingualLibriSpeechScenario.HF_DATASET_NAME,
65
+ name=self._language,
66
+ cache_dir=output_path,
67
+ split=TEST_SPLIT,
68
+ )
69
+ )
70
+ ):
71
+ local_audio_path = os.path.join(audio_save_dir, str(idx) + "_" + row["original_path"].split("/")[-1])
72
+ # download to the local path
73
+ ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
74
+ answer = row["transcript"]
75
+ input = Input(
76
+ multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
77
+ )
78
+ references = [Reference(Output(text=str(answer)), tags=[CORRECT_TAG])]
79
+ instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
80
+ return instances
@@ -0,0 +1,142 @@
1
+ import json
2
+ import os
3
+ from typing import List
4
+
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ TEST_SPLIT,
9
+ Scenario,
10
+ Instance,
11
+ Reference,
12
+ CORRECT_TAG,
13
+ Input,
14
+ Output,
15
+ )
16
+ from helm.common.audio_utils import is_invalid_audio_file, extract_audio
17
+ from helm.common.media_object import MediaObject, MultimediaObject
18
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
19
+
20
+
21
+ class MUStARDScenario(Scenario):
22
+ """
23
+ MUStARD: Multimodal Sarcasm Detection Dataset
24
+
25
+ A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
26
+ TV shows including Friends, The Golden Girls, The Big Bang Theory, and Sarcasmaholics Anonymous. MUStARD consists
27
+ of audiovisual utterances annotated with sarcasm labels. Each utterance is accompanied by its context, providing
28
+ additional information on the scenario where it occurs.
29
+
30
+ We just extract the audio from the given videos.
31
+
32
+ The columns of the dataset are:
33
+ - utterance: The text of the target utterance to classify.
34
+ - speaker: Speaker of the target utterance.
35
+ - context: List of utterances (in chronological order) preceding the target utterance.
36
+ - context_speakers: Respective speakers of the context utterances.
37
+ - sarcasm: Binary label for sarcasm tag.
38
+
39
+ More specifically an example looks like this:
40
+
41
+ "1_60": {
42
+ "utterance": "It's just a privilege to watch your mind at work.",
43
+ "speaker": "SHELDON",
44
+ "context": [
45
+ "I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.",
46
+ "My apologies. What's your plan?"
47
+ ],
48
+ "context_speakers": [
49
+ "LEONARD",
50
+ "SHELDON"
51
+ ],
52
+ "show": "BBT",
53
+ "sarcasm": true
54
+ }
55
+
56
+ The key is the video id.
57
+
58
+ The video folder has two subfolders:
59
+ - context_final: Contains the context videos (e.g., 1_60_c.mp4)
60
+ - utterances_final: Contains the target utterance videos (e.g., 1_60.mp4)
61
+
62
+ Citation:
63
+
64
+ @inproceedings{mustard,
65
+ title = "Towards Multimodal Sarcasm Detection (An \_Obviously\_ Perfect Paper)",
66
+ author = "Castro, Santiago and
67
+ Hazarika, Devamanyu and
68
+ P{\'e}rez-Rosas, Ver{\'o}nica and
69
+ Zimmermann, Roger and
70
+ Mihalcea, Rada and
71
+ Poria, Soujanya",
72
+ booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
73
+ (Volume 1: Long Papers)",
74
+ month = "7",
75
+ year = "2019",
76
+ address = "Florence, Italy",
77
+ publisher = "Association for Computational Linguistics",
78
+ }
79
+ """
80
+
81
+ RAW_VIDEO_CLIPS_URL: str = "https://huggingface.co/datasets/MichiganNLP/MUStARD/resolve/main/mmsd_raw_data.zip"
82
+ ANNOTATIONS_URL: str = (
83
+ "https://raw.githubusercontent.com/soujanyaporia/MUStARD/refs/heads/master/data/" "sarcasm_data.json"
84
+ )
85
+
86
+ name = "mustard"
87
+ description = "Sarcasm detection benchmark ([Castro et al, 2018](https://arxiv.org/abs/1906.01815))."
88
+ tags = ["audio", "classification", "toxicity", "sarcasm detection"]
89
+
90
+ def get_instances(self, output_path: str) -> List[Instance]:
91
+ # Download the annotations
92
+ annotations_path: str = os.path.join(output_path, "sarcasm_data.json")
93
+ ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)
94
+
95
+ # Where the video files will be downloaded to
96
+ video_path: str = os.path.join(output_path, "video")
97
+ ensure_file_downloaded(self.RAW_VIDEO_CLIPS_URL, video_path, unpack=True)
98
+
99
+ # Where the audio files will be extracted to
100
+ audio_path: str = os.path.join(output_path, "audio")
101
+ ensure_directory_exists(audio_path)
102
+
103
+ instances: List[Instance] = []
104
+ annotations = json.load(open(annotations_path, "r"))
105
+ for key, row in tqdm(annotations.items()):
106
+ # Extract the audio from the context video
107
+ context_audio_path: str = os.path.join(audio_path, f"{key}_c.mp3")
108
+ if not os.path.exists(context_audio_path):
109
+ # Extract the audio from the video
110
+ context_video_path: str = os.path.join(video_path, "context_final", f"{key}_c.mp4")
111
+ extract_audio(context_video_path, context_audio_path)
112
+ assert not is_invalid_audio_file(context_audio_path), f"Invalid audio file: {context_audio_path}"
113
+
114
+ # Extract the audio from the target utterance video
115
+ utterance_audio_path: str = os.path.join(audio_path, f"{key}.mp3")
116
+ if not os.path.exists(utterance_audio_path):
117
+ utterance_video_path: str = os.path.join(video_path, "utterances_final", f"{key}.mp4")
118
+ extract_audio(utterance_video_path, utterance_audio_path)
119
+ assert not is_invalid_audio_file(utterance_audio_path), f"Invalid audio file: {utterance_audio_path}"
120
+
121
+ input = Input(
122
+ multimedia_content=MultimediaObject(
123
+ media_objects=[
124
+ # Input both the context and the utterance audio
125
+ MediaObject(text="Context:", content_type="text/plain"),
126
+ MediaObject(location=context_audio_path, content_type="audio/mpeg"),
127
+ MediaObject(text="Utterance:", content_type="text/plain"),
128
+ MediaObject(location=utterance_audio_path, content_type="audio/mpeg"),
129
+ MediaObject(
130
+ text="Given the context, does the utterance contain sarcasm?", content_type="text/plain"
131
+ ),
132
+ ]
133
+ )
134
+ )
135
+ is_sarcastic: bool = row["sarcasm"]
136
+ references = [
137
+ Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_sarcastic else []),
138
+ Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_sarcastic else []),
139
+ ]
140
+ instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
141
+
142
+ return instances
@@ -0,0 +1,254 @@
1
+ from io import BytesIO
2
+ from typing import List
3
+ import os
4
+ import requests
5
+
6
+ from pydub import AudioSegment
7
+ from tqdm import tqdm
8
+ import pandas as pd
9
+
10
+ from helm.benchmark.scenarios.scenario import (
11
+ TEST_SPLIT,
12
+ Scenario,
13
+ Instance,
14
+ Reference,
15
+ CORRECT_TAG,
16
+ Input,
17
+ Output,
18
+ )
19
+ from helm.common.audio_utils import is_invalid_audio_file
20
+ from helm.common.media_object import MediaObject, MultimediaObject
21
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
22
+ from helm.common.hierarchical_logger import hlog, htrack_block
23
+
24
+
25
+ class MuToxScenario(Scenario):
26
+ """
27
+ MuTox: MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector
28
+
29
+ MuTox, the first highly multilingual audio-based dataset with toxicity labels. The dataset consists of 20k
30
+ audio utterances for English and Spanish, and 4k for the other languages. To showcase the quality of this
31
+ dataset, we train the MuTox audio-based toxicity classifier, which allows zero-shot toxicity detection across
32
+ a broad range of languages. This classifier outperforms existing text-based trainable classifiers by more than
33
+ 1% AUC, while increasing the language coverage from 8 to 100+ languages. When compared to a wordlist-based
34
+ classifier that covers a similar number of languages, MuTox improves precision and recall by ∼2.5 times.
35
+
36
+ Languages:
37
+ "Arabic": "arb",
38
+ "Bengali": "ben",
39
+ "Bulgarian": "bul",
40
+ "Catalan": "cat",
41
+ "Czech": "ces",
42
+ "Mandarin Chinese": "cmn",
43
+ "Danish": "dan",
44
+ "German": "deu",
45
+ "Greek": "ell",
46
+ "English": "eng",
47
+ "Estonian": "est",
48
+ "Western Persian": "fas",
49
+ "Finnish": "fin",
50
+ "French": "fra",
51
+ "Hebrew": "heb",
52
+ "Hindi": "hin",
53
+ "Hungarian": "hun",
54
+ "Indonesian": "ind",
55
+ "Italian": "ita",
56
+ "Dutch": "nld",
57
+ "Polish": "pol",
58
+ "Portuguese": "por",
59
+ "Russian": "rus",
60
+ "Spanish": "spa",
61
+ "Slovak": "slk",
62
+ "Swahili": "swh",
63
+ "Tagalog": "tgl",
64
+ "Turkish": "tur",
65
+ "Urdu": "urd",
66
+ "Vietnamese": "vie",
67
+
68
+ The columns of the dataset are:
69
+
70
+ id: a string id of the segment;
71
+ lang: 3-letter language code;
72
+ partition: one of train, dev, or devtest
73
+ public_url_segment: a string formatted as url:start:end, where start and end are indicated in milliseconds;
74
+ audio_file_transcript: text transctiption of the segment;
75
+ contains_toxicity, toxicity_types, perlocutionary_effects: annotation results as strings
76
+ label: an integer label, equal to 1 if contains_toxicity equals Yes and 0 otherwise;
77
+ etox_result: toxic word (or multiple words, separated by |) detected by the Etox matcher;
78
+ detoxify_score: toxicity probabilities predicted by the Detoxify system (float numbers between 0 and 1);
79
+ mutox_speech_score, mutox_text_score, mutox_zero_shot_speech_score, mutox_zero_shot_text_score: MuTox predictions
80
+ as float numbers with any value (they can be interpreted as logits,
81
+ i.e. probabilities before a sigmoid transformation).
82
+
83
+ Citation:
84
+
85
+ @misc{costajussà2023mutox,
86
+ title={MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector},
87
+ author={ Marta R. Costa-jussà, Mariano Coria Meglioli, Pierre Andrews, David Dale, Prangthip Hansanti,
88
+ Elahe Kalbassi, Alex Mourachko, Christophe Ropers, Carleigh Wood},
89
+ year={2023},
90
+ eprint={},
91
+ archivePrefix={arXiv},
92
+ primaryClass={cs.CL}
93
+ }
94
+ """
95
+
96
+ ANNOTATIONS_URL = "https://dl.fbaipublicfiles.com/seamless/datasets/mutox.tsv"
97
+
98
+ LANGAUGE_CODES = {
99
+ "Arabic": "arb",
100
+ "Bengali": "ben",
101
+ "Bulgarian": "bul",
102
+ "Catalan": "cat",
103
+ "Czech": "ces",
104
+ "Mandarin_Chinese": "cmn",
105
+ "Danish": "dan",
106
+ "German": "deu",
107
+ "Greek": "ell",
108
+ "English": "eng",
109
+ "Estonian": "est",
110
+ "Western_Persian": "fas",
111
+ "Finnish": "fin",
112
+ "French": "fra",
113
+ "Hebrew": "heb",
114
+ "Hindi": "hin",
115
+ "Hungarian": "hun",
116
+ "Indonesian": "ind",
117
+ "Italian": "ita",
118
+ "Dutch": "nld",
119
+ "Polish": "pol",
120
+ "Portuguese": "por",
121
+ "Russian": "rus",
122
+ "Spanish": "spa",
123
+ "Slovak": "slk",
124
+ "Swahili": "swh",
125
+ "Tagalog": "tgl",
126
+ "Turkish": "tur",
127
+ "Urdu": "urd",
128
+ "Vietnamese": "vie",
129
+ }
130
+
131
+ name = "mutox"
132
+ description = "Toxicity detection benchmark ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060))."
133
+ tags = ["audio", "classification", "toxicity "]
134
+
135
+ @staticmethod
136
+ def track_bad_audio_file(bad_audio_file: str, output_path: str) -> None:
137
+ """
138
+ Many of the links do not exist or point to broken so we keep track of them
139
+ and skip them in the future runs to significantly speed up gathering the instances.
140
+ """
141
+ with open(output_path, "a") as f:
142
+ f.write(bad_audio_file + "\n")
143
+
144
+ def __init__(self, language: str) -> None:
145
+ super().__init__()
146
+ self._language_code: str = self.LANGAUGE_CODES[language]
147
+
148
+ def get_instances(self, output_path: str) -> List[Instance]:
149
+ # Download the annotations
150
+ annotations_path: str = os.path.join(output_path, "mutox.tsv")
151
+ ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)
152
+
153
+ # Read bad audio files
154
+ bad_audio_files: set[str] = set()
155
+ bad_audio_files_path: str = os.path.join(output_path, "bad_audio_files.txt")
156
+ if os.path.exists(bad_audio_files_path):
157
+ # Each line is the audio file name
158
+ with open(bad_audio_files_path, "r") as f:
159
+ for line in f:
160
+ bad_audio_files.add(line.strip())
161
+ hlog(f"Found {len(bad_audio_files)} bad audio files.")
162
+
163
+ # Where the audio files will be downloaded to
164
+ audio_path: str = os.path.join(output_path, "audio")
165
+ ensure_directory_exists(audio_path)
166
+
167
+ instances: List[Instance] = []
168
+ df = pd.read_csv(annotations_path, delimiter="\t")
169
+ hlog(f"Found {len(df)} rows in the dataset")
170
+
171
+ valid_count: int = 0
172
+ total_count: int = 0
173
+ for row in tqdm(df.itertuples()):
174
+ # Only proces examples that are in devtest and the language we're interested in
175
+ if row.partition != "devtest":
176
+ continue
177
+
178
+ if row.lang != self._language_code:
179
+ continue
180
+
181
+ total_count += 1
182
+
183
+ # Discard known bad audio files
184
+ audio_filename: str = f"{row.id}.mp3"
185
+ with htrack_block(f"Processing audio file: {audio_filename}"):
186
+ if audio_filename in bad_audio_files:
187
+ hlog(f"Skipping this example -- known bad audio file: {audio_filename}")
188
+ continue
189
+
190
+ local_audio_path: str = os.path.join(audio_path, audio_filename)
191
+ if not os.path.exists(local_audio_path):
192
+ # The provided URL has the complete audio, so we need to download it and clip it
193
+ # public_url_segment: a string formatted as url:start:end,
194
+ if not isinstance(row.public_url_segment, str):
195
+ # Sometimes URL is just a float causing an error. Skip those.
196
+ hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
197
+ continue
198
+
199
+ parts = row.public_url_segment.split()
200
+ if len(parts) != 3:
201
+ hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
202
+ continue
203
+
204
+ audio_url, start_ms_str, end_ms_str = parts
205
+ start_ms, end_ms = int(start_ms_str), int(end_ms_str)
206
+
207
+ # Download the full audio file
208
+ try:
209
+ response = requests.get(audio_url, stream=True)
210
+ response.raise_for_status()
211
+
212
+ # Load audio from the downloaded file
213
+ audio = AudioSegment.from_file(BytesIO(response.content))
214
+
215
+ # Clip the audio
216
+ clipped_audio = audio[start_ms:end_ms]
217
+
218
+ # Save the clipped file
219
+ clipped_audio.export(local_audio_path, format="mp3")
220
+ except Exception as e:
221
+ # Some files are no longer available or invalid
222
+ hlog(
223
+ f"Skipping this example -- error downloading / processing audio file from {audio_url}: {e}"
224
+ )
225
+ self.track_bad_audio_file(audio_filename, bad_audio_files_path)
226
+ continue
227
+
228
+ if is_invalid_audio_file(local_audio_path):
229
+ hlog(f"Skipping this example -- Invalid audio file even after downloading at {local_audio_path}")
230
+ self.track_bad_audio_file(audio_filename, bad_audio_files_path)
231
+ continue
232
+ else:
233
+ valid_count += 1
234
+ hlog(f"Valid audio file found at {local_audio_path} (valid/total: {valid_count}/{total_count})")
235
+
236
+ input = Input(
237
+ multimedia_content=MultimediaObject(
238
+ media_objects=[
239
+ # Following what was prompted annotators as outlined in the paper
240
+ MediaObject(text="Does the utterance contain toxicity?", content_type="text/plain"),
241
+ MediaObject(location=local_audio_path, content_type="audio/mpeg"),
242
+ ]
243
+ )
244
+ )
245
+
246
+ is_toxic: bool = row.label == 1
247
+ references = [
248
+ Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_toxic else []),
249
+ Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_toxic else []),
250
+ ]
251
+ instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
252
+
253
+ assert len(instances) > 0, f"No instances found for language: {self._language_code}"
254
+ return instances
@@ -0,0 +1,97 @@
1
+ from typing import List
2
+ import os
3
+ import json
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TEST_SPLIT,
10
+ CORRECT_TAG,
11
+ Input,
12
+ Output,
13
+ )
14
+ from tqdm import tqdm
15
+ from helm.common.media_object import MediaObject, MultimediaObject
16
+ from helm.common.general import ensure_file_downloaded
17
+
18
+
19
+ class PARADEScenario(Scenario):
20
+ """PARADE
21
+
22
+ PARADE dataset is inspired by the PAIRS dataset for evaluating occupation and status bias
23
+ in vision-language models. We collect a new dataset of audio-text multi-choice QA task that
24
+ involves exploring occupation and status bias. The dataset consists of 436 audio-text QA pairs
25
+ with 3 options each.
26
+ """
27
+
28
+ ANNOT_URL = (
29
+ "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/audio_result" "_path_mapping_v2.json"
30
+ )
31
+ MALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/onyx.zip"
32
+ FEMALE_AUDIO = "https://huggingface.co/datasets/UCSC-VLAA/PARADE_audio/resolve/main/nova.zip"
33
+
34
+ PARADE_INSTRUCTION = "\n\n Answer the question with one of the following options: A, B, or C."
35
+
36
+ SUBSET_LIST = ["occupation", "status"]
37
+ VOICE_MAPPING = {"male": "onyx", "female": "nova"}
38
+
39
+ name = "parade"
40
+ description = "Exploring occupation and status bias in the audio-text multi-choice QA task."
41
+ tags: List[str] = ["audio", "bias"]
42
+
43
+ def __init__(self, subset: str, voice: str) -> None:
44
+ super().__init__()
45
+
46
+ subset = subset.lower()
47
+ voice = voice.lower()
48
+ if subset not in PARADEScenario.SUBSET_LIST:
49
+ raise ValueError(f"Invalid subset. Valid subsets are: {PARADEScenario.SUBSET_LIST}")
50
+
51
+ if voice not in PARADEScenario.VOICE_MAPPING.keys():
52
+ raise ValueError(f"Invalid voice. Valid voices are: {PARADEScenario.VOICE_MAPPING.keys()}")
53
+
54
+ self._subset: str = subset
55
+ self._voice: str = voice
56
+
57
+ def _convert_answer_to_label(self, options: list, answer: str) -> str:
58
+ option_list = ["A", "B", "C"]
59
+ return option_list[options.index(answer)]
60
+
61
+ def get_instances(self, output_path: str) -> List[Instance]:
62
+ instances: List[Instance] = []
63
+ annot_save_path = os.path.join(output_path, "annotation.json")
64
+ ensure_file_downloaded(source_url=PARADEScenario.ANNOT_URL, target_path=annot_save_path)
65
+ annotations = json.load(open(annot_save_path))[self.VOICE_MAPPING[self._voice]][self._subset]
66
+ test_annotations = []
67
+ for key in annotations:
68
+ for key2 in annotations[key]:
69
+ test_annotations.append(annotations[key][key2])
70
+ audio_save_dir = os.path.join(output_path, "audio_files")
71
+ if self._voice == "male":
72
+ ensure_file_downloaded(source_url=PARADEScenario.MALE_AUDIO, target_path=audio_save_dir, unpack=True)
73
+ else:
74
+ ensure_file_downloaded(source_url=PARADEScenario.FEMALE_AUDIO, target_path=audio_save_dir, unpack=True)
75
+ for row in tqdm(test_annotations):
76
+ local_audio_path = os.path.join(output_path, "audio_files", row["path"])
77
+ answer = self._convert_answer_to_label(row["options"], row["label"])
78
+ # The given correct answer is a letter, but we need an index
79
+ correct_answer_index: int = ord(answer) - ord("A")
80
+ references: List[Reference] = []
81
+ question = row["question"]
82
+ for i, option in enumerate(row["options"]):
83
+ reference: Reference
84
+ is_correct: bool = i == correct_answer_index
85
+ reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
86
+ references.append(reference)
87
+
88
+ input = Input(
89
+ multimedia_content=MultimediaObject(
90
+ [
91
+ MediaObject(content_type="audio/mpeg", location=local_audio_path),
92
+ MediaObject(content_type="text/plain", text=question + self.PARADE_INSTRUCTION),
93
+ ]
94
+ )
95
+ )
96
+ instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
97
+ return instances