crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.


Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py
@@ -0,0 +1,96 @@
+ from typing import List
+ import os
+ import json
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from tqdm import tqdm
+ from datasets import load_dataset
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array
+ from helm.common.general import ensure_file_downloaded
+
+
+ class AMIScenario(Scenario):
+     """AMI Meeting Corpus
+     The AMI Meeting Corpus (Carletta et al. 2005) is a multi-modal data set consisting of
+     100 hours of meeting recordings. It is being created in the context of a project that
+     is developing meeting browsing technology. The corpus is being recorded using a wide
+     range of devices including close-talking and far-field microphones, individual and
+     room-view video cameras, projection, a whiteboard, and individual pens, all of which
+     produce output signals that are synchronized with each other.
+
+     Paper: https://link.springer.com/chapter/10.1007/11677482_3
+     Code: https://groups.inf.ed.ac.uk/ami/corpus/
+
+     Citation:
+     @inproceedings{Carletta2005TheAM,
+         title={The AMI Meeting Corpus: A Pre-announcement},
+         author={Jean Carletta and Simone Ashby and Sebastien Bourban and Mike Flynn and Ma{\"e}l Guillemot
+         and Thomas Hain and Jaroslav Kadlec and Vasilis Karaiskos and Wessel Kraaij and Melissa Kronenthal
+         and Guillaume Lathoud and Mike Lincoln and Agnes Lisowska Masson and Iain McCowan and Wilfried Post
+         and Dennis Reidsma and Pierre D. Wellner},
+         booktitle={Machine Learning for Multimodal Interaction},
+         year={2005},
+         url={https://api.semanticscholar.org/CorpusID:6118869}
+     }
+     """
+
+     HF_DATASET_NAME = "edinburghcstr/ami"
+     HF_MAPPING_URL = (
+         "https://huggingface.co/datasets/PahaII/SRB_instance_key_mapping/resolve/main/srb_instance_keys.json"
+     )
+     name = "ami"
+     description = (
+         "Speech recognition of speech recorded with different devices "
+         "([Carletta et al, 2005](https://link.springer.com/chapter/10.1007/11677482_3))."
+     )
+     SUBJECT_DICT = {
+         "ihm": {"mapping_key": "ami_ihm_id2line", "srb_mapping": "nearfield"},
+         "sdm": {"mapping_key": "ami_sdm_id2line", "srb_mapping": "farfield"},
+     }
+     tags: List[str] = ["audio", "recognition"]
+
+     def __init__(self, subject: str) -> None:
+         super().__init__()
+         subject = subject.lower()
+         if subject not in AMIScenario.SUBJECT_DICT.keys():
+             raise ValueError(f"Invalid subject. Valid subjects are: {AMIScenario.SUBJECT_DICT.keys()}")
+
+         self._subject: str = subject
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         instances: List[Instance] = []
+         audio_save_dir = os.path.join(output_path, "audio_files")
+         mapping_local_path = os.path.join(output_path, "srb_instance_keys.json")
+         ensure_file_downloaded(source_url=AMIScenario.HF_MAPPING_URL, target_path=mapping_local_path)
+         meta_data = load_dataset(
+             AMIScenario.HF_DATASET_NAME,
+             name=self._subject,
+             cache_dir=output_path,
+             split=TEST_SPLIT,
+         )
+         index_mappings = AMIScenario.SUBJECT_DICT[self._subject]["mapping_key"]
+         srb_mappings = AMIScenario.SUBJECT_DICT[self._subject]["srb_mapping"]
+         mapping_dict = json.load(open(mapping_local_path))
+         srb_mapping_keys = mapping_dict["srb_aim_field_key2audio"][srb_mappings]
+         index2line_num = mapping_dict[index_mappings]
+         for line_num in tqdm(list(srb_mapping_keys)):
+             row = meta_data[int(index2line_num[line_num])]
+             local_audio_name = f"{self._subject}_{line_num}.wav"
+             local_audio_path = os.path.join(audio_save_dir, local_audio_name)
+             ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
+             answer = row["text"].lower()
+             input = Input(
+                 multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
+             )
+             references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+         return instances
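
The new audio scenarios in this release all build HELM Instance objects from the same primitives (Input, Output, Reference, Instance) imported at the top of each file. A minimal, self-contained sketch of that shared pattern, with a made-up text-only transcript standing in for a real audio file, might look like:

    # Illustrative sketch only (not part of this diff): the Instance/Reference
    # pattern shared by the audio scenarios, using a made-up transcript.
    from helm.benchmark.scenarios.scenario import (
        CORRECT_TAG,
        TEST_SPLIT,
        Input,
        Instance,
        Output,
        Reference,
    )

    reference = Reference(Output(text="okay let's start the meeting"), tags=[CORRECT_TAG])
    instance = Instance(
        input=Input(text="Transcribe the following meeting audio."),
        references=[reference],
        split=TEST_SPLIT,
    )
    print(instance.references[0].output.text)
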
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py
@@ -0,0 +1,62 @@
+ """Scenarios for audio models"""
+
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from tqdm import tqdm
+ from datasets import load_dataset
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+ from helm.common.audio_utils import ensure_audio_file_exists_from_array
+
+
+ class AudioMNISTScenario(Scenario):
+     """AudioMNIST
+
+     The AudioMNIST (Becker et al, 2023) dataset consists of a dataset of 30000 audio samples of
+     spoken digits (0-9) of 60 different speakers. The task is to classify the digit from the
+     audio sample.
+
+     Paper: https://arxiv.org/abs/1807.03418
+     Code: https://github.com/soerenab/AudioMNIST
+
+     Citation:
+     @article{audiomnist2023,
+         title = {AudioMNIST: Exploring Explainable Artificial Intelligence for audio analysis on a simple benchmark},
+         journal = {Journal of the Franklin Institute},
+         year = {2023},
+         issn = {0016-0032},
+         doi = {https://doi.org/10.1016/j.jfranklin.2023.11.038},
+         url = {https://www.sciencedirect.com/science/article/pii/S0016003223007536},
+         author = {Sören Becker and Johanna Vielhaben and Marcel Ackermann and Klaus-Robert Müller and Sebastian Lapuschkin and Wojciech Samek},
+         keywords = {Deep learning, Neural networks, Interpretability, Explainable artificial intelligence, Audio classification, Speech recognition},
+     }
+     """  # noqa: E501
+
+     name = "audio_mnist"
+     description = "Classify an audio sample of a spoken digit ([Becker et al, 2023](https://arxiv.org/abs/1807.03418))."
+     tags = ["audio", "classification"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         instances: List[Instance] = []
+         wav_save_dir: str = os.path.join(output_path, "audio_files")
+         ensure_directory_exists(wav_save_dir)
+         for row in tqdm(load_dataset("flexthink/audiomnist", cache_dir=output_path, split=TEST_SPLIT)):
+             local_audio_path = os.path.join(wav_save_dir, row["audio"]["path"])
+             audio_array = row["audio"]["array"]
+             ensure_audio_file_exists_from_array(local_audio_path, audio_array, row["audio"]["sampling_rate"])
+             input = Input(
+                 multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
+             )
+             references = [Reference(Output(text=str(row["digit"])), tags=[CORRECT_TAG])]
+             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+         return instances
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py
@@ -0,0 +1,62 @@
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from tqdm import tqdm
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded
+ import json
+
+
+ class AudioPAIRSScenario(Scenario):
+     """Audio PAIRS
+
+     Audio PAIRS is an audio extension of the PAIRS dataset (Fraser et al, 2024) to examine gender and
+     racial bias in audio large language models. We convert the questions in the PAIRS dataset to audio
+     clips using OpenAI's TTS-1-HD API.
+
+     This dataset is also modified to add an option to opt-out with "unclear" as a choice.
+     """
+
+     DOWNLOADING_URL = "https://huggingface.co/datasets/UCSC-VLAA/Audio_PAIRS/resolve/main/audio_pairs_files.zip"
+     SUJECTS = ["occupation", "status", "potential_crime"]
+
+     name = "audio_pairs"
+     description = "Examining gender and racial bias in AudioLMs using a converted audio from the PAIRS dataset."
+     tags: List[str] = ["audio", "classification"]
+
+     def __init__(self, subject: str) -> None:
+         super().__init__()
+
+         if subject not in AudioPAIRSScenario.SUJECTS:
+             raise ValueError(f"Invalid subject. Valid subjects are: {AudioPAIRSScenario.SUJECTS}")
+
+         self._subject: str = subject
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         instances: List[Instance] = []
+         downloading_dir: str = os.path.join(output_path, "download")
+         ensure_file_downloaded(source_url=AudioPAIRSScenario.DOWNLOADING_URL, target_path=downloading_dir, unpack=True)
+         data_dir: str = os.path.join(downloading_dir, "audio_pairs_files")
+         audio_file_folder = os.path.join(data_dir, self._subject)
+         audio_instruction_path = os.path.join(data_dir, "audio_pairs_instructions.json")
+         audio_instructions = json.load(open(audio_instruction_path))[self._subject]
+         for audio_file_name, instruction in tqdm(audio_instructions.items()):
+             local_audio_file_name = "_".join(audio_file_name.split("_")[:-1]) + ".mp3"
+             local_audio_path: str = os.path.join(audio_file_folder, local_audio_file_name)
+             content = [
+                 MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                 MediaObject(content_type="text/plain", text=instruction),
+             ]
+             input = Input(multimedia_content=MultimediaObject(content))
+             references = [Reference(Output(text="unclear"), tags=[CORRECT_TAG])]
+             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+         return instances
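
AudioPAIRSScenario derives each local file name by dropping the trailing "_<suffix>" segment of the instruction key and appending ".mp3". A small sketch of that rewrite, using a hypothetical key (real keys come from audio_pairs_instructions.json):

    # Sketch of the filename rewrite in AudioPAIRSScenario.get_instances;
    # the key below is hypothetical.
    audio_file_name = "occupation_prompt_03_v1"
    local_audio_file_name = "_".join(audio_file_name.split("_")[:-1]) + ".mp3"
    print(local_audio_file_name)  # -> "occupation_prompt_03.mp3"
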
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py
@@ -0,0 +1,59 @@
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from tqdm import tqdm
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded
+ import pandas as pd
+
+
+ class AudioCapsScenario(Scenario):
+     """AudioCaps
+
+     AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
+     via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
+     musical instruments and genres, and common everyday environmental sounds.
+
+     Paper: https://aclanthology.org/N19-1011.pdf
+     Code: https://github.com/cdjkim/audiocaps
+
+     Citation:
+     @inproceedings{audiocaps,
+         title={AudioCaps: Generating Captions for Audios in The Wild},
+         author={Kim, Chris Dongjoo and Kim, Byeongchang and Lee, Hyunmin and Kim, Gunhee},
+         booktitle={NAACL-HLT},
+         year={2019}
+     }
+     """
+
+     DOWNLOADING_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/wav_files.zip"
+     REFERENCE_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/test.csv"
+
+     name = "audiocaps"
+     description = "A large-scale dataset of about 46K audio clips to human-written text pairs \
+         ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))."
+     tags: List[str] = ["audio", "captioning"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         instances: List[Instance] = []
+         data_dir: str = os.path.join(output_path, "wav_files")
+         ensure_file_downloaded(source_url=AudioCapsScenario.DOWNLOADING_URL, target_path=data_dir, unpack=True)
+         for _, row in tqdm(pd.read_csv(AudioCapsScenario.REFERENCE_URL, sep=",").iterrows()):
+             audiocap_id = row["audiocap_id"]
+             audio_path: str = os.path.join(data_dir, f"{audiocap_id}.wav")
+             assert os.path.exists(audio_path), f"Audio file does not exist at path: {audio_path}"
+             input = Input(
+                 multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path)])
+             )
+             references = [Reference(Output(text=str(row["caption"])), tags=[CORRECT_TAG])]
+             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+         return instances
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
@@ -0,0 +1,152 @@
+ from typing import List, Optional
+ import os
+
+ from tqdm import tqdm
+ import json
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+ from helm.common.audio_utils import extract_audio
+
+
+ class CasualConversations2Scenario(Scenario):
+     """
+     Casual Conversation v2 (Porgali et al, 2023) is composed of over 5,567 participants (26,467 videos).
+     The videos feature paid individuals who agreed to participate in the project and explicitly provided
+     Age, Gender, Language/Dialect, Geo-location, Disability, Physical adornments, Physical attributes labels
+     themselves. The videos were recorded in Brazil, India, Indonesia, Mexico, Philippines, United States,
+     and Vietnam with a diverse set of adults in various categories.
+
+     The dataset contains the audio, speaker's age, gender information in the following languages:
+     English, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Tamil, Telugu, and Vietnamese.
+
+     Paper: https://arxiv.org/abs/2303.04838
+     Dataset: https://ai.meta.com/datasets/casual-conversations-v2-dataset/
+
+     Requires downloading Causal Conversations V2 from https://ai.meta.com/datasets/casual-conversations-v2-downloads
+
+     Citation:
+     @inproceedings{porgali2023casual,
+         title={The casual conversations v2 dataset},
+         author={Porgali, Bilal and Albiero, V{\'\i}tor and Ryda, Jordan and Ferrer, Cristian Canton and Hazirbas, Caner},
+         booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+         pages={10--17},
+         year={2023}
+     }
+     """
+
+     SUBJECTS = ["age", "gender"]
+     SCRIPT_DOWNLOADING_URL = (
+         "https://huggingface.co/datasets/UCSC-VLAA/Causal_Conversation_V2_script/"
+         "resolve/main/CasualConversationsV2_v2.json"
+     )
+     AGE_INSTRUCTION = "Listen to the audio and take your best guess to estimate the speaker's age."
+     GENDER_INSTRUCTION = "Listen to the audio and take your best guess to determine the speaker's gender."
+     name = "casual_conversations2"
+     description = (
+         "A large scale multilingual speech translation corpus "
+         "([Porgali et al., 2023](https://arxiv.org/abs/2303.04838))."
+     )
+     tags = ["audio", "classification", "multilinguality"]
+     gender_options: List[str] = ["male", "female", "transgender male", "transgender female", "non-binary", "other"]
+     age_options: List[str] = ["18-30", "31-50", "51+", "other"]
+
+     def __init__(self, subject: str) -> None:
+         super().__init__()
+
+         if subject not in self.SUBJECTS:
+             raise ValueError(f"Invalid subject. Valid subjects are: {CasualConversations2Scenario.SUBJECTS}")
+
+         self._subject: str = subject
+         self._convert_answer_to_label_func = (
+             self._convert_age_to_label if subject == "age" else self._convert_gender_to_label
+         )
+         self.options = self.age_options if subject == "age" else self.gender_options
+         self.instruction = self.AGE_INSTRUCTION if subject == "age" else self.GENDER_INSTRUCTION
+
+     def _convert_age_to_label(self, age: str) -> str:
+         if age != "prefer not to say":
+             age_int = int(age)
+             if 18 <= age_int <= 30:
+                 return "A"
+             elif 31 <= age_int <= 50:
+                 return "B"
+             elif 51 <= age_int:
+                 return "C"
+             else:
+                 raise ValueError(f"Invalid age: {age}")
+         else:
+             return "D"
+
+     def _convert_gender_to_label(self, gender: Optional[str]) -> str:
+         if gender is not None and gender != "prefer not to say":
+             if gender == "cis man":
+                 return "A"
+             elif gender == "cis woman":
+                 return "B"
+             elif gender == "transgender man":
+                 return "C"
+             elif gender == "transgender woman":
+                 return "D"
+             elif gender == "non-binary":
+                 return "E"
+             else:
+                 raise ValueError(f"Invalid gender: {gender}")
+         else:
+             return "F"
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_dir: str = os.path.join(output_path, "videos_files")
+         assert os.path.exists(data_dir), (
+             f"Download the video files from Meta's Casual Conversations v2 dataset from "
+             f"(https://ai.meta.com/datasets/casual-conversations-v2-downloads) and unzip and place at {data_dir}."
+         )
+         script_file_path: str = os.path.join(output_path, "CasualConversationsV2.json")
+         audio_file_folder: str = os.path.join(output_path, "audio_files")
+         ensure_directory_exists(audio_file_folder)
+         ensure_file_downloaded(self.SCRIPT_DOWNLOADING_URL, script_file_path)
+         audio_scripts = json.load(open(script_file_path))
+
+         instances: List[Instance] = []
+         split: str = TEST_SPLIT
+
+         for file_name in tqdm(os.listdir(data_dir)):
+             if file_name.endswith(".mp4"):
+                 local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
+                 local_video_path: str = os.path.join(data_dir, file_name)
+
+                 if not os.path.exists(local_audio_path):
+                     extract_audio(local_video_path, local_audio_path)
+                 assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"
+
+                 subject_answer = audio_scripts[file_name][self._subject]
+                 answer = self._convert_answer_to_label_func(subject_answer)
+                 # The given correct answer is a letter, but we need an index
+                 correct_answer_index: int = ord(answer) - ord("A")
+                 # The options are originally appended to the question
+
+                 references: List[Reference] = []
+                 for i, option in enumerate(self.options):
+                     reference: Reference
+                     is_correct: bool = i == correct_answer_index
+                     reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                     references.append(reference)
+
+                 content = [
+                     MediaObject(content_type="audio/mpeg", location=local_audio_path),
+                     MediaObject(content_type="text/plain", text=self.instruction),
+                 ]
+
+                 input = Input(multimedia_content=MultimediaObject(content))
+                 instances.append(Instance(input=input, references=references, split=split))
+
+         return instances
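
CasualConversations2Scenario first buckets the raw age or gender answer into a letter and then converts that letter into an index into the options list with ord(label) - ord("A"). A standalone sketch of the age path, reproducing the same rules on example values:

    # Standalone sketch of CasualConversations2Scenario._convert_age_to_label,
    # reproduced outside the class with example inputs.
    def age_to_label(age: str) -> str:
        if age == "prefer not to say":
            return "D"
        age_int = int(age)
        if 18 <= age_int <= 30:
            return "A"
        elif 31 <= age_int <= 50:
            return "B"
        elif age_int >= 51:
            return "C"
        raise ValueError(f"Invalid age: {age}")

    for age in ["25", "40", "70", "prefer not to say"]:
        label = age_to_label(age)
        correct_answer_index = ord(label) - ord("A")  # index into age_options
        print(age, label, correct_answer_index)
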
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py
@@ -0,0 +1,99 @@
+ """Scenarios for audio models"""
+
+ from typing import List
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from collections import OrderedDict
+ from tqdm import tqdm
+ from datasets import load_dataset
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.hierarchical_logger import hlog
+
+
+ class CommonVoice15Scenario(Scenario):
+     """CommonVoice15 Scenario
+
+     The most recent release of CommonVoice15 (Ardila et al, 2019) includes 114 languages. Over 50,000
+     individuals have participated so far, resulting in 2,500 hours of collected audio. This is the largest
+     audio corpus in the public domain for speech recognition, both in terms of number of hours and number
+     of languages. The task is to recognize the speech from the audio sample.
+
+
+
+     Paper: https://arxiv.org/abs/1912.06670
+     Code: https://github.com/common-voice/common-voice
+
+     Citation:
+     @article{ardila2019common,
+         title={Common voice: A massively-multilingual speech corpus},
+         author={Ardila, Rosana and Branson, Megan and Davis, Kelly and
+         Henretty, Michael and Kohler, Michael and Meyer, Josh and Morais,
+         Reuben and Saunders, Lindsay and Tyers, Francis M and Weber, Gregor},
+         journal={arXiv preprint arXiv:1912.06670},
+         year={2019}
+     }
+
+     """
+
+     HF_DATASET_NAME = "mozilla-foundation/common_voice_15_0"
+
+     # Randomly selected 4 languages from 114 languages in the Common Voice 15 dataset following
+     # Qwen2-Audio (https://arxiv.org/abs/2407.10759). The full language is:
+     # https://huggingface.co/datasets/mozilla-foundation/common_voice_15_0/blob/main/languages.py
+     _COMMON_VOICE_TEST_LANG_TO_ID = OrderedDict(
+         [
+             ("English", "en"),
+             ("Chinese_hk", "zh-HK"),
+             ("German", "de"),
+             ("French", "fr"),
+         ]
+     )
+
+     name = "common_voice_15"
+     description = "Speech recognition for 4 languages from 114 different languages in Common Voice 15 \
+         ([Ardila et al, 2019](https://arxiv.org/abs/1912.06670))."
+     tags: List[str] = ["audio", "recognition", "multilinguality"]
+
+     def __init__(self, language: str) -> None:
+         super().__init__()
+
+         language = language.capitalize()
+         if language not in CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID.keys():
+             raise ValueError(
+                 f"Invalid language. Valid languages are: {CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID.keys()}"
+             )
+
+         self._language: str = language
+         hlog(
+             "You need to sign in Huggingface to download the dataset. Please remember "
+             "to sign in to download the dataset."
+         )
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         instances: List[Instance] = []
+         language_category = CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID[self._language]
+         for row in tqdm(
+             load_dataset(
+                 CommonVoice15Scenario.HF_DATASET_NAME,
+                 name=language_category,
+                 cache_dir=output_path,
+                 split=TEST_SPLIT,
+                 trust_remote_code=True,
+             )
+         ):
+             local_audio_path = row["path"]
+             answer = row["sentence"]
+             input = Input(
+                 multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
+             )
+             references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+             instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+         return instances
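
CommonVoice15Scenario normalizes its language argument with str.capitalize() before looking up the Common Voice config name. A small sketch of that normalization and lookup, using the same four-language table as the file above:

    # Sketch of the language normalization and config lookup performed in
    # CommonVoice15Scenario; the table mirrors _COMMON_VOICE_TEST_LANG_TO_ID.
    from collections import OrderedDict

    lang_to_id = OrderedDict(
        [("English", "en"), ("Chinese_hk", "zh-HK"), ("German", "de"), ("French", "fr")]
    )

    language = "english".capitalize()  # "english" -> "English"
    if language not in lang_to_id:
        raise ValueError(f"Invalid language. Valid languages are: {list(lang_to_id)}")
    print(lang_to_id[language])  # -> "en"
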