crfm-helm: crfm_helm-0.5.3-py3-none-any.whl → crfm_helm-0.5.5-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -38,12 +38,14 @@ def _request_state(prediction: str, options: List[_Option]):
38
38
  )
39
39
 
40
40
 
41
- def assert_stats_equal(actual_stats: List[Stat], expected_values: Dict[str, float]):
42
- actual_values = {stat.name.name: stat.mean for stat in actual_stats}
43
- assert actual_values == approx(expected_values)
41
+ def get_stat_value(stats: List[Stat], stat_name: str):
42
+ for stat in stats:
43
+ if stat.name.name == stat_name:
44
+ return stat.mean
45
+ raise ValueError(f"No stat with name {stat_name}")
44
46
 
45
47
 
46
- def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
48
+ def compute_stats(all_classes_counts: Dict[str, Dict[str, int]]):
47
49
  micro_counts: Dict[str, int] = defaultdict(int)
48
50
  for class_counts in all_classes_counts.values():
49
51
  for key, class_count in class_counts.items():
@@ -52,102 +54,172 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
52
54
  micro_recall = micro_counts["tp"] / (micro_counts["tp"] + micro_counts["fn"])
53
55
  micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
54
56
 
55
- class_f1: List[float] = []
56
- for class_counts in all_classes_counts.values():
57
+ class_f1: Dict[str, float] = {}
58
+ for class_name, class_counts in all_classes_counts.items():
57
59
  class_precision = class_counts["tp"] / (class_counts["tp"] + class_counts["fp"])
58
60
  class_recall = class_counts["tp"] / (class_counts["tp"] + class_counts["fn"])
59
- class_f1.append(2 * (class_precision * class_recall) / (class_precision + class_recall))
60
- macro_f1 = sum(class_f1) / len(class_f1)
61
-
62
- return {
63
- "classification_micro_f1": micro_f1,
64
- "classification_macro_f1": macro_f1,
61
+ class_f1[class_name] = 2 * (class_precision * class_recall) / (class_precision + class_recall)
62
+ macro_f1 = sum(class_f1.values()) / len(class_f1)
63
+ class_name_to_support = {
64
+ class_name: class_counts["tp"] + class_counts["fn"] for class_name, class_counts in all_classes_counts.items()
65
65
  }
66
+ weighted_f1 = sum(class_f1[class_name] * support for class_name, support in class_name_to_support.items()) / sum(
67
+ support for support in class_name_to_support.values()
68
+ )
66
69
 
70
+ stats = {
71
+ "macro_f1": macro_f1,
72
+ "micro_f1": micro_f1,
73
+ "weighted_f1": weighted_f1,
74
+ }
75
+ for class_name, class_f1_score in class_f1.items():
76
+ stats[f"{class_name}_f1"] = class_f1_score
77
+ return stats
67
78
 
68
- def test_evaluate_instances_binary_generation():
69
- metric = ClassificationMetric(delimiter=None)
70
79
 
80
+ def test_evaluate_instances_default_parameters():
71
81
  request_states = [
72
82
  _request_state("yes", [_Option("yes", True)]),
83
+ _request_state("yes ", [_Option("yes", True)]),
84
+ _request_state("yeS", [_Option("yes", True)]),
85
+ _request_state("yes", [_Option("no", True)]),
86
+ _request_state("no", [_Option("yes", True)]),
87
+ _request_state("no", [_Option("no", True)]),
88
+ _request_state("invalid", [_Option("no", True)]),
89
+ ]
90
+
91
+ expected_stats = compute_stats(
92
+ {
93
+ "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
94
+ "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
95
+ }
96
+ )
97
+
98
+ actual_stats = ClassificationMetric().evaluate_instances(request_states, "")
99
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
100
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
101
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
102
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
103
+
104
+
105
+ def test_evaluate_instances_yes_and_no():
106
+ labels = ["yes", "no"]
107
+ request_states = [
73
108
  _request_state("yes", [_Option("yes", True)]),
74
- _request_state("yes", [_Option("yes", True)]),
109
+ _request_state("yes ", [_Option("yes", True)]),
110
+ _request_state("yeS", [_Option("yes", True)]),
75
111
  _request_state("yes", [_Option("no", True)]),
76
112
  _request_state("no", [_Option("yes", True)]),
77
113
  _request_state("no", [_Option("no", True)]),
78
114
  _request_state("invalid", [_Option("no", True)]),
79
115
  ]
80
116
 
81
- assert_stats_equal(
82
- metric.evaluate_instances(request_states, ""),
83
- _expected_stats(
84
- {
85
- "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
86
- "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
87
- }
88
- ),
117
+ expected_stats = compute_stats(
118
+ {
119
+ "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
120
+ "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
121
+ }
89
122
  )
90
123
 
124
+ actual_stats = ClassificationMetric(
125
+ scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels
126
+ ).evaluate_instances(request_states, "")
127
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
128
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
129
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
130
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
131
+ actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
132
+ assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
133
+ actual_yes_f1 = get_stat_value(actual_stats, "classification_yes_f1")
134
+ assert actual_yes_f1 == approx(expected_stats["yes_f1"])
135
+ actual_no_f1 = get_stat_value(actual_stats, "classification_no_f1")
136
+ assert actual_no_f1 == approx(expected_stats["no_f1"])
137
+
91
138
 
92
139
  def test_evaluate_instances_multi_class():
93
- # Note: no "a" because it would get filtered out by normalize_text()
94
- metric = ClassificationMetric(delimiter=None)
140
+ labels = ["a", "b", "c"]
95
141
 
96
- def _options(correct: str):
97
- return [_Option(text, text == correct) for text in ["d", "b", "c"]]
142
+ def _gold_label(correct: str):
143
+ return [_Option(text, text == correct) for text in labels]
98
144
 
99
145
  request_states = [
100
- _request_state("d", _options("d")),
101
- _request_state("d", _options("d")),
102
- _request_state("d", _options("d")),
103
- _request_state("d", _options("b")),
104
- _request_state("b", _options("b")),
105
- _request_state("b", _options("b")),
106
- _request_state("b", _options("c")),
107
- _request_state("c", _options("d")),
108
- _request_state("c", _options("c")),
109
- _request_state("invalid", _options("c")),
146
+ _request_state("a", _gold_label("a")),
147
+ _request_state("a", _gold_label("a")),
148
+ _request_state("a", _gold_label("a")),
149
+ _request_state("a", _gold_label("b")),
150
+ _request_state("b", _gold_label("b")),
151
+ _request_state("b", _gold_label("b")),
152
+ _request_state("b", _gold_label("c")),
153
+ _request_state("c", _gold_label("a")),
154
+ _request_state("c", _gold_label("c")),
155
+ _request_state("invalid", _gold_label("c")),
110
156
  ]
111
- assert_stats_equal(
112
- metric.evaluate_instances(request_states, ""),
113
- _expected_stats(
114
- {
115
- "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
116
- "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
117
- "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
118
- }
119
- ),
157
+
158
+ expected_stats = compute_stats(
159
+ {
160
+ "a": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
161
+ "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
162
+ "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
163
+ }
120
164
  )
121
165
 
166
+ actual_stats = ClassificationMetric(
167
+ scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels
168
+ ).evaluate_instances(request_states, "")
169
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
170
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
171
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
172
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
173
+ actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
174
+ assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
175
+ actual_a_f1 = get_stat_value(actual_stats, "classification_a_f1")
176
+ assert actual_a_f1 == approx(expected_stats["a_f1"])
177
+ actual_b_f1 = get_stat_value(actual_stats, "classification_b_f1")
178
+ assert actual_b_f1 == approx(expected_stats["b_f1"])
179
+ actual_c_f1 = get_stat_value(actual_stats, "classification_c_f1")
180
+ assert actual_c_f1 == approx(expected_stats["c_f1"])
181
+
122
182
 
123
183
  def test_evaluate_instances_multilabel():
124
- # Note: no "a" because it would get filtered out by normalize_text()
125
- metric = ClassificationMetric(delimiter=",")
184
+ labels = ["a", "b", "c"]
126
185
 
127
- def _options(correct: List[str]):
128
- return [_Option(text, text in correct) for text in ["d", "b", "c"]]
186
+ def _gold_labels(correct: List[str]):
187
+ return [_Option(text, text in correct) for text in labels]
129
188
 
130
189
  request_states = [
131
- _request_state("d,b", _options(["d", "b"])),
132
- _request_state("d,b", _options(["d", "c"])),
133
- _request_state("d", _options(["d"])),
134
- _request_state("c", _options(["b"])),
135
- _request_state("b", _options(["b", "c"])),
136
- _request_state("d,b", _options(["c"])),
137
- _request_state("d,c", _options(["d"])),
138
- _request_state("d,b,c", _options(["d", "b", "c"])),
190
+ _request_state("a,b", _gold_labels(["a", "b"])),
191
+ _request_state("a,b", _gold_labels(["a", "c"])),
192
+ _request_state("a", _gold_labels(["a"])),
193
+ _request_state("c", _gold_labels(["b"])),
194
+ _request_state("b", _gold_labels(["b", "c"])),
195
+ _request_state("a,b", _gold_labels(["c"])),
196
+ _request_state("a,c", _gold_labels(["a"])),
197
+ _request_state("a,b,c", _gold_labels(["a", "b", "c"])),
139
198
  _request_state("", []),
140
199
  _request_state("n/a", []),
141
- _request_state("invalid", _options(["c"])),
200
+ _request_state("invalid", _gold_labels(["c"])),
142
201
  ]
143
202
 
144
- assert_stats_equal(
145
- metric.evaluate_instances(request_states, ""),
146
- _expected_stats(
147
- {
148
- "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
149
- "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
150
- "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
151
- }
152
- ),
203
+ expected_stats = compute_stats(
204
+ {
205
+ "a": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
206
+ "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
207
+ "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
208
+ }
153
209
  )
210
+
211
+ actual_stats = ClassificationMetric(
212
+ scores=["f1"], averages=["macro", "micro", "weighted", None], labels=labels, delimiter=","
213
+ ).evaluate_instances(request_states, "")
214
+ actual_macro_f1 = get_stat_value(actual_stats, "classification_macro_f1")
215
+ assert actual_macro_f1 == approx(expected_stats["macro_f1"])
216
+ actual_micro_f1 = get_stat_value(actual_stats, "classification_micro_f1")
217
+ assert actual_micro_f1 == approx(expected_stats["micro_f1"])
218
+ actual_weighted_f1 = get_stat_value(actual_stats, "classification_weighted_f1")
219
+ assert actual_weighted_f1 == approx(expected_stats["weighted_f1"])
220
+ actual_a_f1 = get_stat_value(actual_stats, "classification_a_f1")
221
+ assert actual_a_f1 == approx(expected_stats["a_f1"])
222
+ actual_b_f1 = get_stat_value(actual_stats, "classification_b_f1")
223
+ assert actual_b_f1 == approx(expected_stats["b_f1"])
224
+ actual_c_f1 = get_stat_value(actual_stats, "classification_c_f1")
225
+ assert actual_c_f1 == approx(expected_stats["c_f1"])
@@ -1,4 +1,7 @@
1
+ import pytest
1
2
  from helm.benchmark.metrics.evaluate_reference_metrics import (
3
+ bleu_1,
4
+ chinese_bleu_1,
2
5
  exact_match,
3
6
  exact_match_indicator,
4
7
  final_number_exact_match,
@@ -28,3 +31,15 @@ def test_final_number_exact_match():
28
31
  assert final_number_exact_match("34.2", "2") == 0
29
32
  assert final_number_exact_match("342", "342.") == 1
30
33
  assert final_number_exact_match("3,420", "3420") == 1
34
+
35
+
36
+ def test_bleu_1():
37
+ assert bleu_1("the quick brown fox jumped over the lazy dog", "the fox jumped over the dog") == pytest.approx(
38
+ 0.6065306597126334
39
+ )
40
+
41
+
42
+ def test_chinese_bleu_1():
43
+ assert chinese_bleu_1(
44
+ "太祖武皇帝,沛國譙人也,姓曹,諱操,字孟德,漢相國參之後。", "太祖武皇帝,沛國譙人也,漢相國參之後。"
45
+ ) == pytest.approx(0.5907775139012316)
@@ -1,5 +1,5 @@
1
1
  from helm.common.request import Token
2
- from .basic_metrics import get_num_bytes, convert_tokens_to_text
2
+ from helm.benchmark.metrics.basic_metrics import get_num_bytes, convert_tokens_to_text
3
3
 
4
4
 
5
5
  def test_get_num_bytes():
@@ -3,8 +3,8 @@ from typing import Dict
3
3
  import pytest
4
4
  import statistics
5
5
 
6
- from .metric_name import MetricName
7
- from .statistic import Stat, merge_stat
6
+ from helm.benchmark.metrics.metric_name import MetricName
7
+ from helm.benchmark.metrics.statistic import Stat, merge_stat
8
8
 
9
9
 
10
10
  def test_stat_add():
@@ -1,6 +1,6 @@
1
1
  from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
- from .token_cost_estimator import TokenCostEstimator
3
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
4
4
 
5
5
 
6
6
  class AI21TokenCostEstimator(TokenCostEstimator):
@@ -2,12 +2,12 @@ from typing import Dict
2
2
 
3
3
  from helm.benchmark.metrics.metric_service import MetricService
4
4
  from helm.common.request import Request
5
- from .ai21_token_cost_estimator import AI21TokenCostEstimator
6
- from .cohere_token_cost_estimator import CohereTokenCostEstimator
7
- from .free_token_cost_estimator import FreeTokenCostEstimator
8
- from .gooseai_token_cost_estimator import GooseAITokenCostEstimator
9
- from .openai_token_cost_estimator import OpenAITokenCostEstimator
10
- from .token_cost_estimator import TokenCostEstimator
5
+ from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
6
+ from helm.benchmark.metrics.tokens.cohere_token_cost_estimator import CohereTokenCostEstimator
7
+ from helm.benchmark.metrics.tokens.free_token_cost_estimator import FreeTokenCostEstimator
8
+ from helm.benchmark.metrics.tokens.gooseai_token_cost_estimator import GooseAITokenCostEstimator
9
+ from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
10
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
11
11
 
12
12
 
13
13
  class AutoTokenCostEstimator(TokenCostEstimator):
@@ -1,6 +1,6 @@
1
1
  from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
- from .token_cost_estimator import TokenCostEstimator
3
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
4
4
 
5
5
 
6
6
  class CohereTokenCostEstimator(TokenCostEstimator):
@@ -1,6 +1,6 @@
1
1
  from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
- from .token_cost_estimator import TokenCostEstimator
3
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
4
4
 
5
5
 
6
6
  class FreeTokenCostEstimator(TokenCostEstimator):
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.benchmark.window_services.window_service import WindowService
3
3
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
4
4
  from helm.common.request import Request
5
- from .token_cost_estimator import TokenCostEstimator
5
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
6
6
 
7
7
 
8
8
  class GooseAITokenCostEstimator(TokenCostEstimator):
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
2
2
  from helm.common.request import Request
3
3
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
4
4
  from helm.benchmark.window_services.window_service import WindowService
5
- from .token_cost_estimator import TokenCostEstimator
5
+ from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
6
6
 
7
7
 
8
8
  class OpenAITokenCostEstimator(TokenCostEstimator):
@@ -1,7 +1,7 @@
1
1
  from unittest.mock import Mock
2
2
 
3
3
  from helm.common.request import Request
4
- from .ai21_token_cost_estimator import AI21TokenCostEstimator
4
+ from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
5
5
 
6
6
 
7
7
  class TestAI21TokenCostEstimator:
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
8
8
  from helm.common.request import Request
9
9
  from helm.common.tokenization_request import TokenizationRequestResult, TokenizationToken
10
10
  from helm.proxy.services.remote_service import RemoteService
11
- from .openai_token_cost_estimator import OpenAITokenCostEstimator
11
+ from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
12
12
 
13
13
 
14
14
  class TestOpenAITokenCostEstimator:
@@ -6,10 +6,10 @@ from helm.common.hierarchical_logger import hlog
6
6
  from helm.benchmark.adaptation.request_state import RequestState
7
7
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
8
  from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
9
- from .metric import Metric
10
- from .metric_name import MetricName
11
- from .metric_service import MetricService
12
- from .statistic import Stat
9
+ from helm.benchmark.metrics.metric import Metric
10
+ from helm.benchmark.metrics.metric_name import MetricName
11
+ from helm.benchmark.metrics.metric_service import MetricService
12
+ from helm.benchmark.metrics.statistic import Stat
13
13
 
14
14
 
15
15
  class ToxicityMetric(Metric):
@@ -1,9 +1,11 @@
1
+ import numbers
1
2
  import re
2
- from typing import Dict, List
3
+ from typing import Dict, List, Set
3
4
 
4
5
  from datasets import load_dataset
5
6
  import evaluate
6
7
 
8
+ from helm.common.general import hlog
7
9
  from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
8
10
  from helm.benchmark.adaptation.scenario_state import ScenarioState
9
11
  from helm.benchmark.metrics.metric_name import MetricName
@@ -16,7 +18,10 @@ class UnitxtMetric(MetricInterface):
16
18
 
17
19
  def __init__(self, **kwargs):
18
20
  super().__init__()
19
- dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
21
+ if len(kwargs) == 1 and "recipe" in kwargs:
22
+ dataset_name = kwargs["recipe"]
23
+ else:
24
+ dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
20
25
  self.dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
21
26
 
22
27
  def evaluate(
@@ -42,6 +47,7 @@ class UnitxtMetric(MetricInterface):
42
47
  )
43
48
 
44
49
  # Extract instance metrics
50
+ non_number_instance_metric_names: Set[str] = set()
45
51
  per_instance_stats: List[PerInstanceStats] = []
46
52
  for request_state, evaluate_result in zip(scenario_state.request_states, evaluate_results):
47
53
  instance = request_state.instance
@@ -60,9 +66,15 @@ class UnitxtMetric(MetricInterface):
60
66
  )
61
67
  if isinstance(metric_score, list):
62
68
  for metric_score_element in metric_score:
63
- stat = stat.add(metric_score_element)
69
+ if isinstance(metric_score_element, numbers.Number):
70
+ stat = stat.add(metric_score_element)
71
+ else:
72
+ non_number_instance_metric_names.add(metric_name)
64
73
  else:
65
- stat = stat.add(metric_score)
74
+ if isinstance(metric_score, numbers.Number):
75
+ stat = stat.add(metric_score)
76
+ else:
77
+ non_number_instance_metric_names.add(metric_name)
66
78
  instance_stats.append(stat)
67
79
  assert instance.id
68
80
  per_instance_stats.append(
@@ -73,6 +85,11 @@ class UnitxtMetric(MetricInterface):
73
85
  stats=instance_stats,
74
86
  )
75
87
  )
88
+ if non_number_instance_metric_names:
89
+ hlog(
90
+ "WARNING: Ignored Unitxt instance metrics because "
91
+ f"they were not numbers: {non_number_instance_metric_names}"
92
+ )
76
93
 
77
94
  # Extract global metrics
78
95
  aggregated_stats: List[Stat] = []
@@ -303,7 +303,10 @@ class AnnotatedImageMetrics(Metric):
303
303
  if self._lpips_metric is None:
304
304
  with warnings.catch_warnings():
305
305
  warnings.filterwarnings("ignore", category=UserWarning)
306
- self._lpips_metric = LearnedPerceptualImagePatchSimilarity(net_type="vgg").to(self._device)
306
+ # https://lightning.ai/docs/torchmetrics/stable/image/learned_perceptual_image_patch_similarity.html
307
+ self._lpips_metric = LearnedPerceptualImagePatchSimilarity(net_type="vgg", normalize=True).to(
308
+ self._device
309
+ )
307
310
 
308
311
  preprocessing = transforms.Compose(
309
312
  [
@@ -321,7 +324,7 @@ class AnnotatedImageMetrics(Metric):
321
324
 
322
325
  # Compute the LPIPS score
323
326
  assert self._lpips_metric is not None
324
- score: float = self._lpips_metric(img1, img2).detach().item()
327
+ score: float = 1.0 - self._lpips_metric(img1, img2).detach().item()
325
328
  return score
326
329
 
327
330
  def _calculate_fid(self, act1, act2):
@@ -400,7 +403,8 @@ class AnnotatedImageMetrics(Metric):
400
403
 
401
404
  def compute_ssim(self, generated_image: np.ndarray, reference_image: np.ndarray) -> float:
402
405
  """Compute the Structural Similarity Index (SSIM) between the generated and reference images."""
403
- return ssim(generated_image, reference_image)
406
+ # Add 1 and divide by 2 to get a normalized score between 0 and 1, where 1 is the most similar
407
+ return (ssim(generated_image, reference_image) + 1) / 2
404
408
 
405
409
  def compute_edit_sim(self, completion: str, reference: str) -> float:
406
410
  # `reference` is the entire remaining book for each instance.
@@ -0,0 +1,34 @@
1
+ from typing import Any, Dict, List
2
+
3
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
+ from helm.benchmark.adaptation.request_state import RequestState
5
+ from helm.benchmark.metrics.metric import Metric
6
+ from helm.benchmark.metrics.metric_name import MetricName
7
+ from helm.benchmark.metrics.metric_service import MetricService
8
+ from helm.benchmark.metrics.statistic import Stat
9
+
10
+
11
+ class WildBenchScoreMetric(Metric):
12
+ """Score metrics for WildBench."""
13
+
14
+ def evaluate_generation(
15
+ self,
16
+ adapter_spec: AdapterSpec,
17
+ request_state: RequestState,
18
+ metric_service: MetricService,
19
+ eval_cache_path: str,
20
+ ) -> List[Stat]:
21
+ assert request_state.annotations
22
+ annotations: Dict[str, Any] = request_state.annotations["wildbench"]
23
+ scores: List[float] = []
24
+ for annotation_key, annotation_value in annotations.items():
25
+ if annotation_key.endswith("_score") and annotation_value is not None:
26
+ scores.append(annotation_value)
27
+ if not scores:
28
+ raise ValueError("Could not compute WB Score because all annotators failed.")
29
+ score = sum(scores) / len(scores)
30
+ score_rescaled = (score - 1) / 9
31
+ return [
32
+ Stat(MetricName("wildbench_score")).add(score),
33
+ Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
34
+ ]
@@ -22,6 +22,9 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
22
22
  # OpenAI Chat format
23
23
  OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
24
24
 
25
+ # For NOVA models
26
+ NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
27
+
25
28
  # For Anthropic models
26
29
  ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
27
30
  ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -66,10 +69,18 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
66
69
  LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
67
70
  FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
68
71
 
72
+ # For Audio-language models (AudioLMs)
73
+ AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
74
+
69
75
  # Deprecated models that are no longer available.
70
76
  # These are usually closed API models that have been permanently removed
71
77
  DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
72
78
 
79
+ # Unsupported models.
80
+ # These are models that we have chosen not to support because they are
81
+ # private, stale, non-notable, or difficult to implement.
82
+ UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
83
+
73
84
  # Frozen is set to false as the model_deployment_registry.py file
74
85
  # might populate the deployment_names field.
75
86
 
@@ -208,6 +219,11 @@ def is_vlm(model_name: str) -> bool:
208
219
  return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
209
220
 
210
221
 
222
+ def is_audiolm(model_name: str) -> bool:
223
+ """Returns True if the model is an audio-language model (AudioLM), False otherwise."""
224
+ return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
225
+
226
+
211
227
  def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
212
228
  """Return placeholder ModelMetadata for an unknown model."""
213
229
  return ModelMetadata(
@@ -14,7 +14,6 @@ from helm.benchmark.config_registry import register_builtin_configs_from_helm_pa
14
14
  from helm.common.hierarchical_logger import hlog
15
15
  from helm.common.optional_dependencies import handle_module_not_found_error
16
16
  from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
17
- from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
18
17
 
19
18
  try:
20
19
  import colorcet
@@ -39,6 +38,7 @@ metric_group_to_label = {
39
38
  "Efficiency": f"Inference time (s) {DOWN_ARROW}",
40
39
  }
41
40
  all_metric_groups = list(metric_group_to_label.keys())
41
+ AGGREGATE_WIN_RATE_COLUMN = 1
42
42
 
43
43
 
44
44
  @dataclass
@@ -119,6 +119,9 @@ class MetricGroup(Field):
119
119
  hide_win_rates: Optional[bool] = None
120
120
  """If set to true, do not compute win rates."""
121
121
 
122
+ aggregation_strategies: Optional[List[str]] = None
123
+ """List with values in {'win_rate','mean'} that correspond to aggregations"""
124
+
122
125
 
123
126
  BY_METRIC = "by_metric"
124
127
  BY_GROUP = "by_group"