crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm has been flagged as potentially problematic.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py
@@ -0,0 +1,49 @@
+ """Run spec functions for three clinical sections of MMLU human-translated into 11 African languages
+
+ Available subjects: "clinical_knowledge", "college_medicine", "virology"
+ Available langs: "af", "zu", "xh", "am", "bm", "ig", "nso", "sn", "st", "tn", "ts" (see lang_map below for language code mapping to language name, or here for ISO code reference: https://huggingface.co/languages)
+ """  # noqa: E501
+
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+ from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+ from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("mmlu_clinical_afr")
+ def get_mmlu_clinical_afr_spec(subject: str, lang: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.mmlu_clinical_afr_scenario.MMLU_Clinical_Afr_Scenario",
+         args={"subject": subject, "lang": lang},
+     )
+
+     lang_map = {
+         "af": "Afrikaans",
+         "zu": "Zulu",
+         "xh": "Xhosa",
+         "am": "Amharic",
+         "bm": "Bambara",
+         "ig": "Igbo",
+         "nso": "Sepedi",
+         "sn": "Shona",
+         "st": "Sesotho",
+         "tn": "Setswana",
+         "ts": "Tsonga",
+     }
+
+     adapter_spec = get_multiple_choice_adapter_spec(
+         method=method,
+         instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')} "
+         f"in {lang_map[lang]}.",
+         input_noun="Question",
+         output_noun="Answer",
+     )
+
+     return RunSpec(
+         name=f"mmlu_clinical_afr:subject={subject},lang={lang},method={method}",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=get_exact_match_metric_specs(),
+         groups=[f"mmlu_clinical_afr_{subject}", f"mmlu_clinical_afr_{subject}_{lang}"],
+     )
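
The registered name doubles as the run entry string, so the new spec can be built directly. A minimal sketch, assuming only what the hunk above adds (and that ADAPT_MULTIPLE_CHOICE_JOINT resolves to "multiple_choice_joint"):

    # Build the RunSpec added above for Zulu clinical knowledge; no scenario data is fetched here.
    from helm.benchmark.run_specs.mmlu_clinical_afr_run_specs import get_mmlu_clinical_afr_spec

    run_spec = get_mmlu_clinical_afr_spec(subject="clinical_knowledge", lang="zu")
    print(run_spec.name)
    # mmlu_clinical_afr:subject=clinical_knowledge,lang=zu,method=multiple_choice_joint
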
helm/benchmark/run_specs/oab_exams_specs.py
@@ -0,0 +1,32 @@
+ from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+ from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+ from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("oab_exams")
+ def get_enem_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.oab_exams_scenario.OABExamsScenario", args={})
+
+     adapter_spec = get_multiple_choice_adapter_spec(
+         method=ADAPT_MULTIPLE_CHOICE_JOINT,
+         instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
+         "Se as opções forem A, B, C e D,"
+         "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
+         "Exemplo: Ao conselho da subseção compete\nA. representar a OAB no Conselho de Segurança do MERCOSUL."
+         "\nB. fiscalizar as funções e atribuições do conselho seccional.\nC. instaurar e instruir processos "
+         "disciplinares, para julgamento pelo Conselho Federal.\nD. receber pedido de inscrição nos quadros de "
+         "advogado e estagiário, instruindo e emitindo parecer prévio, para decisão do conselho seccional.\n"
+         "Resposta: D",
+         input_noun="Pergunta",
+         output_noun="Resposta",
+     )
+
+     return RunSpec(
+         name="oab_exams",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=get_exact_match_metric_specs(),
+         groups=["oab_exams"],
+     )
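
get_enem_spec (registered under "oab_exams", despite its name) takes no arguments; a similar sketch of what it returns:

    # Sketch: the fixed Portuguese-language multiple-choice RunSpec added above.
    from helm.benchmark.run_specs.oab_exams_specs import get_enem_spec

    spec = get_enem_spec()
    print(spec.name, spec.groups)  # oab_exams ['oab_exams']
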
helm/benchmark/run_specs/safety_run_specs.py
@@ -41,6 +41,43 @@ def get_harm_bench_spec() -> RunSpec:
      )


+ @run_spec_function("harm_bench_gcg_transfer")
+ def get_harm_bench_gcg_transfer_spec() -> RunSpec:
+     adapter_spec = AdapterSpec(
+         method=ADAPT_GENERATION,
+         global_prefix="",
+         global_suffix="",
+         instructions="",
+         input_prefix="",
+         input_suffix="",
+         output_prefix="",
+         output_suffix="",
+         instance_prefix="",
+         max_train_instances=0,
+         num_outputs=1,
+         max_tokens=512,
+         temperature=0.0,
+         stop_sequences=[],
+     )
+     scenario_spec = ScenarioSpec(
+         class_name="helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario.HarmBenchGCGTransferScenario"
+     )
+     annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]
+     metric_specs = [
+         MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+         MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+         MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+     ]
+     return RunSpec(
+         name="harm_bench_gcg_transfer",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         annotators=annotator_specs,
+         groups=["harm_bench_gcg_transfer"],
+     )
+
+
  @run_spec_function("simple_safety_tests")
  def get_simple_safety_tests_spec() -> RunSpec:
      adapter_spec = AdapterSpec(
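
The new harm_bench_gcg_transfer spec pairs a zero-shot generation adapter with the HarmBench annotator and the safety metrics. A sketch of inspecting it, assuming this hunk is the safety_run_specs.py change listed above:

    # The spec decodes greedily with a 512-token budget and no few-shot examples.
    from helm.benchmark.run_specs.safety_run_specs import get_harm_bench_gcg_transfer_spec

    spec = get_harm_bench_gcg_transfer_spec()
    print(spec.adapter_spec.max_tokens, spec.adapter_spec.temperature)  # 512 0.0
    print([m.class_name for m in spec.metric_specs])
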
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py}
@@ -5,9 +5,9 @@ from helm.benchmark.adaptation.common_adapter_specs import (
      get_generation_adapter_spec,
      get_multiple_choice_separate_adapter_spec,
  )
- from helm.benchmark.metrics.bhasa_metrics_specs import (
-     get_bhasa_machine_translation_metric_specs,
-     get_bhasa_qa_metric_specs,
+ from helm.benchmark.metrics.seahelm_metrics_specs import (
+     get_seahelm_machine_translation_metric_specs,
+     get_seahelm_qa_metric_specs,
  )
  from helm.benchmark.metrics.common_metric_specs import (
      get_basic_metric_specs,
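
For downstream code that imported the old BHASA helpers, the rename is a straight substitution of module and function names; a before/after sketch using the call signature that appears later in this file:

    # 0.5.4 (removed):
    # from helm.benchmark.metrics.bhasa_metrics_specs import get_bhasa_qa_metric_specs
    # metric_specs = get_bhasa_qa_metric_specs(args={"language": "id"})

    # 0.5.6 (added):
    from helm.benchmark.metrics.seahelm_metrics_specs import get_seahelm_qa_metric_specs

    metric_specs = get_seahelm_qa_metric_specs(args={"language": "id"})
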
@@ -17,7 +17,7 @@ from helm.benchmark.metrics.common_metric_specs import (
  from helm.benchmark.run_spec import RunSpec, run_spec_function
  from helm.benchmark.scenarios.scenario import ScenarioSpec

- # BHASA Run Specs
+ # SEA-HELM Run Specs
  # A. Natural Language Understanding
  # B. Natural Language Generation
  # C. Natural Language Reasoning
@@ -43,18 +43,18 @@ def get_tydiqa_spec() -> RunSpec:
          max_tokens=256,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.TyDiQAScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.TyDiQAScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_bhasa_qa_metric_specs(
+         metric_specs=get_seahelm_qa_metric_specs(
              args={
                  "language": "id",
              }
          ),
-         groups=["bhasa_nlu", "tydiqa"],
+         groups=["seahelm_nlu", "tydiqa"],
      )


@@ -84,7 +84,7 @@ def get_xquad_spec(language="th") -> RunSpec:
      )

      scenario_spec = ScenarioSpec(
-         class_name="helm.benchmark.scenarios.bhasa_scenario.XQuADScenario",
+         class_name="helm.benchmark.scenarios.seahelm_scenario.XQuADScenario",
          args={
              "language": language,
          },
@@ -94,12 +94,12 @@ def get_xquad_spec(language="th") -> RunSpec:
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_bhasa_qa_metric_specs(
+         metric_specs=get_seahelm_qa_metric_specs(
              args={
                  "language": language,
              }
          ),
-         groups=["bhasa_nlu", f"xquad_{language}"],
+         groups=["seahelm_nlu", f"xquad_{language}"],
      )


@@ -116,18 +116,18 @@ def get_indicqa_spec() -> RunSpec:
          max_tokens=256,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicQAScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicQAScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_bhasa_qa_metric_specs(
+         metric_specs=get_seahelm_qa_metric_specs(
              args={
                  "language": "ta",
              }
          ),
-         groups=["bhasa_nlu", "indicqa"],
+         groups=["seahelm_nlu", "indicqa"],
      )


@@ -146,14 +146,14 @@ def get_nusax_spec() -> RunSpec:
          max_tokens=16,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.NusaXScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.NusaXScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlu", "nusax"],
+         groups=["seahelm_nlu", "nusax"],
      )


@@ -171,14 +171,14 @@ def get_uitvsfc_spec() -> RunSpec:
          max_tokens=16,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.UITVSFCScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.UITVSFCScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlu", "uitvsfc"],
+         groups=["seahelm_nlu", "uitvsfc"],
      )


@@ -196,14 +196,14 @@ def get_wisesight_spec() -> RunSpec:
          max_tokens=16,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.WisesightScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.WisesightScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlu", "wisesight"],
+         groups=["seahelm_nlu", "wisesight"],
      )


@@ -221,14 +221,14 @@ def get_indicsentiment_spec() -> RunSpec:
          max_tokens=16,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicSentimentScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicSentimentScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_classification_metric_specs() + get_basic_metric_specs([]),
-         groups=["bhasa_nlu", "indicsentiment"],
+         groups=["seahelm_nlu", "indicsentiment"],
      )


@@ -250,14 +250,14 @@ def get_mlhsd_spec() -> RunSpec:
          max_tokens=16,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.MLHSDScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.MLHSDScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlu", "mlhsd"],
+         groups=["seahelm_nlu", "mlhsd"],
      )


@@ -278,14 +278,14 @@ def get_vihsd_spec() -> RunSpec:
          max_tokens=16,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ViHSDScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.ViHSDScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlu", "vihsd"],
+         groups=["seahelm_nlu", "vihsd"],
      )


@@ -304,14 +304,14 @@ def get_thaitoxicitytweets_spec() -> RunSpec:
          max_tokens=16,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ThaiToxicityTweetsScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.ThaiToxicityTweetsScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlu", "thaitoxicitytweets"],
+         groups=["seahelm_nlu", "thaitoxicitytweets"],
      )


@@ -378,7 +378,7 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
      )

      scenario_spec = ScenarioSpec(
-         class_name="helm.benchmark.scenarios.bhasa_scenario.FloresScenario",
+         class_name="helm.benchmark.scenarios.seahelm_scenario.FloresScenario",
          args={
              "pair": pair,
          },
@@ -388,8 +388,8 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_bhasa_machine_translation_metric_specs(),
-         groups=["bhasa_nlg", f"flores_{pair}"],
+         metric_specs=get_seahelm_machine_translation_metric_specs(),
+         groups=["seahelm_nlg", f"flores_{pair}"],
      )


@@ -414,14 +414,14 @@ def get_indonli_spec() -> RunSpec:
          max_tokens=2,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndoNLIScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndoNLIScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlr", "indonli"],
+         groups=["seahelm_nlr", "indonli"],
      )


@@ -456,7 +456,7 @@ def get_xnli_spec(language="vi") -> RunSpec:
      )

      scenario_spec = ScenarioSpec(
-         class_name="helm.benchmark.scenarios.bhasa_scenario.XNLIScenario",
+         class_name="helm.benchmark.scenarios.seahelm_scenario.XNLIScenario",
          args={
              "language": language,
          },
@@ -467,7 +467,7 @@ def get_xnli_spec(language="vi") -> RunSpec:
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlr", f"xnli_{language}"],
+         groups=["seahelm_nlr", f"xnli_{language}"],
      )


@@ -487,14 +487,14 @@ def get_indicxnli_spec() -> RunSpec:
          max_tokens=2,
      )

-     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicXNLIScenario")
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicXNLIScenario")

      return RunSpec(
          name=name,
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlr", "indicxnli"],
+         groups=["seahelm_nlr", "indicxnli"],
      )


@@ -531,7 +531,7 @@ def get_xcopa_spec(language="id") -> RunSpec:
      )

      scenario_spec = ScenarioSpec(
-         class_name="helm.benchmark.scenarios.bhasa_scenario.XCOPAScenario",
+         class_name="helm.benchmark.scenarios.seahelm_scenario.XCOPAScenario",
          args={
              "language": language,
          },
@@ -542,7 +542,7 @@ def get_xcopa_spec(language="id") -> RunSpec:
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-         groups=["bhasa_nlr", f"xcopa_{language}"],
+         groups=["seahelm_nlr", f"xcopa_{language}"],
      )


@@ -566,7 +566,7 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
      )

      scenario_spec = ScenarioSpec(
-         class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
+         class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEASyntaxMinimalPairsScenario",
          args={
              "method": method,
              "language": language,
@@ -579,7 +579,7 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs(),
          groups=[
-             "bhasa_linguistic",
+             "seahelm_linguistic",
              f"lindsea_syntax_minimal_pairs_{language}",
              f"lindsea_syntax_minimal_pairs_{method}_{language}",
          ],
@@ -599,7 +599,7 @@ def get_lindsea_pragmatics_presuppositions_spec(language: str = "id", subset: st
      )

      scenario_spec = ScenarioSpec(
-         class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPresuppositionsScenario",
+         class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEAPragmaticsPresuppositionsScenario",
          args={
              "language": language,
              "subset": subset,
@@ -612,7 +612,7 @@ def get_lindsea_pragmatics_presuppositions_spec(language: str = "id", subset: st
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs(),
          groups=[
-             "bhasa_linguistic",
+             "seahelm_linguistic",
              f"lindsea_pragmatics_presuppositions_{language}",
              f"lindsea_pragmatics_presuppositions_{subset}_{language}",
          ],
@@ -632,7 +632,7 @@ def get_lindsea_pragmatics_scalar_implicatures_spec(language: str = "id", subset
632
632
  )
633
633
 
634
634
  scenario_spec = ScenarioSpec(
635
- class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsScalarImplicaturesScenario",
635
+ class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEAPragmaticsScalarImplicaturesScenario",
636
636
  args={
637
637
  "language": language,
638
638
  "subset": subset,
@@ -645,7 +645,7 @@ def get_lindsea_pragmatics_scalar_implicatures_spec(language: str = "id", subset
645
645
  adapter_spec=adapter_spec,
646
646
  metric_specs=get_exact_match_metric_specs(),
647
647
  groups=[
648
- "bhasa_linguistic",
648
+ "seahelm_linguistic",
649
649
  f"lindsea_pragmatics_scalar_implicatures_{language}",
650
650
  f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}",
651
651
  ],
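
Note: every hunk above makes the same mechanical change: the `bhasa_scenario` module path becomes `seahelm_scenario`, the `bhasa_*` group prefixes become `seahelm_*`, and the translation metric helper is renamed to `get_seahelm_machine_translation_metric_specs`. As a minimal sketch of the resulting pattern (the function name, run name, and the exact import location of the metric-spec helpers are illustrative assumptions, not copied from the diff), a renamed run spec would be built roughly like this:

    # Minimal sketch, assuming the metric-spec helpers live in
    # helm.benchmark.metrics.common_metric_specs as elsewhere in this package.
    # Only the scenario module path and the group prefix differ from the
    # pre-rename ("bhasa") version.
    from helm.benchmark.metrics.common_metric_specs import (
        get_classification_metric_specs,
        get_exact_match_metric_specs,
    )
    from helm.benchmark.run_spec import RunSpec
    from helm.benchmark.scenarios.scenario import ScenarioSpec

    def build_renamed_xnli_spec(adapter_spec, language: str = "vi") -> RunSpec:
        scenario_spec = ScenarioSpec(
            # was: helm.benchmark.scenarios.bhasa_scenario.XNLIScenario
            class_name="helm.benchmark.scenarios.seahelm_scenario.XNLIScenario",
            args={"language": language},
        )
        return RunSpec(
            name=f"xnli:language={language}",  # illustrative run name
            scenario_spec=scenario_spec,
            adapter_spec=adapter_spec,
            metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
            groups=["seahelm_nlr", f"xnli_{language}"],  # was ["bhasa_nlr", ...]
        )
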
@@ -0,0 +1,169 @@
+ from typing import List, Optional
+ from helm.benchmark.adaptation.adapter_spec import (
+ ADAPT_GENERATION_MULTIMODAL,
+ ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+ AdapterSpec,
+ )
+ from helm.benchmark.metrics.common_metric_specs import (
+ get_basic_generation_metric_specs,
+ get_basic_metric_specs,
+ get_multiple_choice_classification_metric_specs,
+ )
+ from helm.benchmark.metrics.metric import MetricSpec
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ def audio_classification_metric_specs() -> List[MetricSpec]:
+ return get_multiple_choice_classification_metric_specs() + get_basic_metric_specs(
+ ["exact_match", "quasi_exact_match"]
+ )
+
+
+ def _get_multiple_choice_joint_adapter_spec(
+ input_noun: Optional[str],
+ output_noun: str,
+ max_train_instances: int = 0,
+ num_outputs: int = 1,
+ ) -> AdapterSpec:
+ return AdapterSpec(
+ method=ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+ global_prefix="",
+ instructions="Answer the multiple choice question by just giving the letter of the correct answer "
+ "and nothing else.",
+ input_prefix=f"{input_noun}: " if input_noun is not None else "",
+ input_suffix="\n",
+ output_prefix=f"{output_noun}: ",
+ output_suffix="\n",
+ instance_prefix="\n",
+ max_train_instances=max_train_instances,
+ num_outputs=num_outputs,
+ max_tokens=1,
+ stop_sequences=["\n"],
+ temperature=0.0,
+ random=None,
+ )
+
+
+ def _get_generation_adapter_spec(
+ max_tokens: int,
+ instructions: str = "",
+ max_train_instances: int = 0,
+ temperature: float = 0.0,
+ stop_sequences: Optional[List[str]] = None,
+ ) -> AdapterSpec:
+ return AdapterSpec(
+ method=ADAPT_GENERATION_MULTIMODAL,
+ instructions=instructions,
+ input_prefix="",
+ input_suffix="",
+ output_prefix="",
+ output_suffix="",
+ instance_prefix="",
+ max_train_instances=max_train_instances,
+ num_outputs=1,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ stop_sequences=stop_sequences if stop_sequences is not None else [],
+ )
+
+
+ @run_spec_function("ultra_suite_classification")
+ def get_ultra_suite_classification_run_spec() -> RunSpec:
+ scenario_spec = ScenarioSpec(
+ class_name="helm.benchmark.scenarios.audio_language.ultra_suite_classification_scenario.UltraSuiteClassificationScenario", # noqa: E501
+ )
+ adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+ input_noun=None, output_noun="Answer", max_train_instances=0
+ )
+ metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+ run_spec_name: str = "ultra_suite_classification"
+ return RunSpec(
+ name=f"{run_spec_name}",
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ metric_specs=metric_specs,
+ groups=[run_spec_name],
+ )
+
+
+ @run_spec_function("ultra_suite_classification_breakdown")
+ def get_ultra_suite_disorder_breakdown_run_spec() -> RunSpec:
+ scenario_spec = ScenarioSpec(
+ class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_breakdown_scenario.UltraSuiteDisorderBreakdownScenario", # noqa: E501
+ )
+ adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+ input_noun=None, output_noun="Answer", max_train_instances=0
+ )
+ metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+ run_spec_name: str = "ultra_suite_classification_breakdown"
+ return RunSpec(
+ name=f"{run_spec_name}",
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ metric_specs=metric_specs,
+ groups=[run_spec_name],
+ )
+
+
+ # Makes the model transcribe the child's speech into text without assuming what the child is supposed to say
+ # if the transcription matches the prompt, then it is classified as typically developing
+ # otherwise, it is classified as having a speech disorder
+ @run_spec_function("ultra_suite_asr_classification")
+ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
+ scenario_spec = ScenarioSpec(
+ class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_classification.UltraSuiteASRClassificationScenario", # noqa: E501
+ )
+ adapter_spec = _get_generation_adapter_spec(
+ instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""", # noqa: E501
+ max_tokens=10,
+ )
+ metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+ run_spec_name: str = "ultra_suite_asr_classification"
+ return RunSpec(
+ name=run_spec_name,
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ metric_specs=metric_specs,
+ groups=[run_spec_name],
+ )
+
+
+ # Makes the model transcribe the child's speech into text and is allowed to assume what the child is supposed to say
+ @run_spec_function("ultra_suite_asr_transcription")
+ def get_ultra_suite_asr_transcription_run_spec() -> RunSpec:
+ scenario_spec = ScenarioSpec(
+ class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_classification.UltraSuiteASRClassificationScenario", # noqa: E501
+ )
+ adapter_spec = _get_generation_adapter_spec(
+ instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Try to understand what the child is expected to say. And only respond with the transcription of the child's speech. Not the pathologist's prompt or any other commentary. Only respond with the text transcription, no other text, commentary or punctuations.""", # noqa: E501
+ max_tokens=50,
+ )
+ metric_specs: List[MetricSpec] = get_basic_generation_metric_specs(["wer_score", "mer_score", "wip_score"])
+ run_spec_name: str = "ultra_suite_asr_transcription"
+ return RunSpec(
+ name=run_spec_name,
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ metric_specs=metric_specs,
+ groups=[run_spec_name],
+ )
+
+
+ @run_spec_function("ultra_suite_disorder_symptoms")
+ def get_ultra_suite_disorder_symptoms_run_spec() -> RunSpec:
+ scenario_spec = ScenarioSpec(
+ class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_symptoms_scenario.UltraSuiteDisorderSymptomsScenario", # noqa: E501
+ )
+ adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+ input_noun=None, output_noun="Answer", max_train_instances=0
+ )
+ metric_specs: List[MetricSpec] = audio_classification_metric_specs()
+ run_spec_name: str = "ultra_suite_disorder_symptoms"
+ return RunSpec(
+ name=f"{run_spec_name}",
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ metric_specs=metric_specs,
+ groups=[run_spec_name],
+ )
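
Note: unlike the classification specs in this new file, the `ultra_suite_asr_transcription` spec scores free-form output with `wer_score`, `mer_score`, and `wip_score`. As a rough, stand-alone illustration of what those three measures capture (the example strings and the use of the third-party jiwer package are assumptions, not taken from this package):

    # Illustrative only: word error rate, match error rate, and word information
    # preserved for a hypothetical gold transcript vs. a hypothetical model output.
    import jiwer

    reference = "the cat sat on the mat"
    hypothesis = "the cat sat on a mat"

    print(jiwer.wer(reference, hypothesis))  # word error rate (lower is better)
    print(jiwer.mer(reference, hypothesis))  # match error rate (lower is better)
    print(jiwer.wip(reference, hypothesis))  # word information preserved (higher is better)
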