crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/unitxt_metrics.py
@@ -5,12 +5,12 @@ from typing import Dict, List, Set
 from datasets import load_dataset
 import evaluate
 
-from helm.common.general import hlog
 from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hwarn
 
 
 class UnitxtMetric(MetricInterface):
@@ -18,7 +18,10 @@ class UnitxtMetric(MetricInterface):
 
     def __init__(self, **kwargs):
         super().__init__()
-        dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
+        if len(kwargs) == 1 and "recipe" in kwargs:
+            dataset_name = kwargs["recipe"]
+        else:
+            dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
         self.dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
 
     def evaluate(
@@ -83,9 +86,8 @@ class UnitxtMetric(MetricInterface):
                 )
             )
         if non_number_instance_metric_names:
-            hlog(
-                "WARNING: Ignored Unitxt instance metrics because "
-                f"they were not numbers: {non_number_instance_metric_names}"
+            hwarn(
+                "Ignored Unitxt instance metrics because " f"they were not numbers: {non_number_instance_metric_names}"
             )
 
         # Extract global metrics
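
The `__init__` change above special-cases a lone `recipe` keyword so Unitxt recipe strings pass through verbatim instead of being re-serialized as `key=value` pairs. A minimal standalone sketch of that naming rule (the helper name and example strings are illustrative, not from the package):

    def unitxt_dataset_name(kwargs: dict) -> str:
        # A lone "recipe" kwarg is passed through verbatim; anything else is
        # flattened into Unitxt's "key=value,key=value" syntax.
        if len(kwargs) == 1 and "recipe" in kwargs:
            return kwargs["recipe"]
        return ",".join(f"{key}={value}" for key, value in kwargs.items())

    assert unitxt_dataset_name({"recipe": "card=cards.wnli,template_card_index=0"}) == "card=cards.wnli,template_card_index=0"
    assert unitxt_dataset_name({"card": "cards.wnli", "template_card_index": 0}) == "card=cards.wnli,template_card_index=0"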
helm/benchmark/metrics/vision_language/emd_utils.py
@@ -320,8 +320,10 @@ def compute_emd_recursive(
     mask1 = np.any(sig1[:, 1:-2] != gray_most_frequent_color, axis=1)
     mask2 = np.any(sig2[:, 1:-2] != gray_most_frequent_color, axis=1)
     mask = np.logical_or(mask1, mask2)
-    sig1 = sig1[mask]
-    sig2 = sig2[mask]
+
+    if np.any(mask):
+        sig1 = sig1[mask]
+        sig2 = sig2[mask]
 
     # Normalize the weights
     weight1 = sig1[:, 0]
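
The new `np.any(mask)` guard keeps both signatures non-empty when no row differs from the most frequent gray color; an empty signature would make the weight normalization that follows divide by zero. A toy illustration (the array values are made up):

    import numpy as np

    sig = np.array([[0.5, 10.0, 10.0], [0.5, 10.0, 10.0]])  # toy signature: [weight, features...]
    mask = np.zeros(len(sig), dtype=bool)  # no row differs from the background color

    filtered = sig[mask]         # shape (0, 3): every row would be dropped
    print(filtered[:, 0].sum())  # 0.0, so weight / weight.sum() would divide by zero

    if np.any(mask):             # patched behavior: only filter when something survives
        sig = sig[mask]
    print(sig.shape)             # (2, 3): the full signatures are kept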
helm/benchmark/metrics/vision_language/image_metrics.py
@@ -324,7 +324,7 @@ class AnnotatedImageMetrics(Metric):
 
         # Compute the LPIPS score
         assert self._lpips_metric is not None
-        score: float = self._lpips_metric(img1, img2).detach().item()
+        score: float = 1.0 - self._lpips_metric(img1, img2).detach().item()
         return score
 
     def _calculate_fid(self, act1, act2):
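
LPIPS is a perceptual distance: 0.0 for identical images, larger for more dissimilar ones. Subtracting it from 1.0 turns the score into a similarity where higher is better, consistent with the direction of the metric's other scores:

    lpips_distance = 0.12              # hypothetical value returned by the LPIPS network
    similarity = 1.0 - lpips_distance  # 0.88; identical images now score 1.0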
helm/benchmark/metrics/vision_language/image_utils.py
@@ -84,8 +84,8 @@ def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     orb = cv2.ORB_create() if hasattr(cv2, "ORB_create") else cv2.ORB()
 
     # Find the keypoints and descriptors with ORB
-    _, desc_a = orb.detectAndCompute(img_a, None)
-    _, desc_b = orb.detectAndCompute(img_b, None)
+    _, desc_a = orb.detectAndCompute(img_a, None)  # type: ignore
+    _, desc_b = orb.detectAndCompute(img_b, None)  # type: ignore
 
     # Initialize the brute force matcher
     bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)
helm/benchmark/metrics/wildbench_metrics.py (new file)
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class WildBenchScoreMetric(Metric):
+    """Score metrics for WildBench."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["wildbench"]
+        scores: List[float] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_score") and annotation_value is not None:
+                scores.append(annotation_value)
+        if not scores:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(scores) / len(scores)
+        score_rescaled = (score - 1) / 9
+        return [
+            Stat(MetricName("wildbench_score")).add(score),
+            Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
+        ]
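
The rescaling at the end maps the judges' 1-10 scale onto [0, 1]: the per-annotator `*_score` values are averaged, then shifted and divided by the width of the scale. A worked example with made-up scores:

    scores = [7.0, 8.0, 9.0]           # hypothetical "*_score" values from three judges
    score = sum(scores) / len(scores)  # 8.0
    score_rescaled = (score - 1) / 9   # 0.777...; a score of 1 maps to 0.0, a 10 to 1.0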
helm/benchmark/model_deployment_registry.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeployment:
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
 
 
@@ -182,7 +180,7 @@ def get_default_model_deployment_for_model(
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
     if deployment.deprecated and ignore_deprecated:
         if warn_arg_deprecated:
-            hlog(f"WARNING: Model deployment {model_name} is deprecated")
+            hwarn(f"Model deployment {model_name} is deprecated")
         return None
     return deployment.name
 
@@ -193,7 +191,7 @@ def get_default_model_deployment_for_model(
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
         hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
 
         # Additionally, if there is a non-deprecated deployment, use it.
@@ -210,7 +208,7 @@ def get_default_model_deployment_for_model(
     else:
         chosen_deployment = available_deployments[0]
         if warn_arg_deprecated:
-            hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+            hwarn(f"All model deployments for model {model_name} are deprecated.")
     if warn_arg_deprecated:
         hlog(
             f"Choosing {chosen_deployment.name} (the first one) as "
helm/benchmark/model_metadata_registry.py
@@ -22,6 +22,9 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
+# For NOVA models
+NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
+
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -66,10 +69,18 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# For audio-language models (AudioLMs)
+AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
+
 # Deprecated models that are no longer available.
 # These are usually closed API models that have been permanently removed
 DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
+# Unsupported models.
+# These are models that we have chosen not to support because they are
+# private, stale, non-notable, or difficult to implement.
+UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
+
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 
@@ -208,6 +219,11 @@ def is_vlm(model_name: str) -> bool:
     return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
 
 
+def is_audiolm(model_name: str) -> bool:
+    """Returns True if the model is an audio-language model (AudioLM). False otherwise."""
+    return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
+
+
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(
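
`is_audiolm` follows the same pattern as the existing `is_vlm` helper: membership of a tag in the model's metadata. A self-contained sketch with `model_has_tag` stubbed by a hypothetical lookup table:

    AUDIO_LANGUAGE_MODEL_TAG = "AUDIO_LANGUAGE_MODEL_TAG"

    _MODEL_TAGS = {  # hypothetical registry contents
        "example-org/audio-model": [AUDIO_LANGUAGE_MODEL_TAG],
        "example-org/text-model": [],
    }

    def model_has_tag(model_name: str, tag: str) -> bool:
        return tag in _MODEL_TAGS.get(model_name, [])

    def is_audiolm(model_name: str) -> bool:
        return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)

    assert is_audiolm("example-org/audio-model")
    assert not is_audiolm("example-org/text-model")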
helm/benchmark/presentation/contamination.py
@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml
 
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema
 
@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")
 
 
 def read_contamination():
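
This file, like many others in the release, swaps `hlog("WARNING: ...")` calls for `hwarn(...)`. Judging purely from the call sites, `hwarn` behaves like a warning-prefixing convenience wrapper around `hlog`; a hedged sketch (the real definition is in helm/common/hierarchical_logger.py and may differ):

    def hlog(message: str) -> None:
        print(message)  # stand-in; the real hlog tracks hierarchical indentation

    def hwarn(message: str) -> None:
        hlog(f"WARNING: {message}")  # assumed behavior, inferred from the diffs

    hwarn("group g not defined in schema")  # -> WARNING: group g not defined in schema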
helm/benchmark/presentation/create_plots.py
@@ -11,7 +11,7 @@ import numpy as np
 from scipy.stats import pearsonr
 
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 
@@ -600,17 +600,7 @@ class Plotter:
         self.create_constrast_set_plots()
 
 
-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +611,36 @@ def main():
     plotter.create_all_plots()
 
 
+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    args = parser.parse_args()
+    setup_default_logging()
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
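
Splitting argument parsing (`main`) from the plotting logic (`create_plots(args)`) lets other entry points drive plotting with a prebuilt namespace instead of going through the CLI. A hedged usage sketch (the suite name is hypothetical):

    import argparse

    from helm.benchmark.presentation.create_plots import create_plots

    args = argparse.Namespace(
        output_path="benchmark_output",  # same default the CLI exposes
        suite="my-suite",                # hypothetical suite name
        plot_format="png",
    )
    create_plots(args)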
helm/benchmark/presentation/run_display.py
@@ -59,6 +59,9 @@ class DisplayPrediction:
 
     annotations: Optional[Dict[str, Any]]
 
+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+
 
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )
 
+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if os.path.exists(image_location)
         ]
 
+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
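
The `thinking_text` plumbing is deliberately guarded: `thinking` is optional on a completion, so the display field stays None unless the model produced any. The extraction in isolation, with stand-in dataclasses for the real completion types:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass(frozen=True)
    class Thinking:  # stand-in for the real completion types
        text: str

    @dataclass(frozen=True)
    class Completion:
        text: str
        thinking: Optional[Thinking] = None

    for completion in (Completion("42", Thinking("Add 40 and 2.")), Completion("42")):
        thinking_text: Optional[str] = completion.thinking.text if completion.thinking else None
        print(thinking_text)  # "Add 40 and 2.", then None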
@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn
 
 
 # TODO: change to `helm.benchmark.config`
@@ -281,5 +282,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
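This is the first of many hunks in this release that replace hand-written `hlog(f"WARNING: ...")` calls with the new `hwarn(...)` helper, so the "WARNING: " prefix is applied in one place instead of at every call site. HELM's real implementation lives in `helm.common.hierarchical_logger` and is not shown in this diff; a hypothetical sketch of the idea:

    def hlog(message: str) -> None:
        # Stand-in for helm.common.hierarchical_logger.hlog.
        print(message)


    def hwarn(message: str) -> None:
        # Centralizing the prefix keeps warning formatting consistent and
        # would let warnings be routed to a real logging level later.
        hlog(f"WARNING: {message}")


    hwarn("The `adapter` field is deprecated and should be removed from schema file schema.yaml")
    # prints: WARNING: The `adapter` field is deprecated and should be removed from schema file schema.yaml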
@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -52,6 +52,10 @@ from helm.benchmark.presentation.schema import (
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
+
+
+MODEL_HEADER_CELL_VALUE = "Model"
 
 
 @dataclass(frozen=True)
@@ -98,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-        hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -262,18 +266,22 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     """
 
     row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
 
     # check for all header cells where specified, that lower_is_better is consistent
     orderings = []
-    for elem in table.header:
-        orderings.append(elem.lower_is_better)
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
     if len(set(orderings)) != 1:
         raise Exception("Cannot mean columns with different values for lower_is_better")
 
     for row in table.rows:
         total = 0.0
         count = 0
-        for cell in row:
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
             if cell.value is not None:
                 total += float(cell.value)
                 count += 1
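The change above teaches `compute_aggregate_row_means` to skip the leading model-name column (identified by the `MODEL_HEADER_CELL_VALUE` header) so that only score columns enter the mean; without the skip, `float(cell.value)` would fail on a model name. A simplified, runnable re-creation of the logic with stand-in types (HELM's real `Table` and `HeaderCell` live in the presentation package):

    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class Cell:  # stand-in for HELM's table cell
        value: Optional[object]


    MODEL_HEADER_CELL_VALUE = "Model"


    def row_means(header: List[str], rows: List[List[Cell]]) -> List[Optional[float]]:
        # Skip the first column when it holds model names rather than scores;
        # otherwise float("gpt-4") below would raise ValueError.
        skip_first = bool(header) and header[0] == MODEL_HEADER_CELL_VALUE
        means: List[Optional[float]] = []
        for row in rows:
            cells = row[1:] if skip_first else row
            values = [float(c.value) for c in cells if c.value is not None]
            means.append(sum(values) / len(values) if values else None)
        return means


    rows = [[Cell("gpt-4"), Cell(0.9), Cell(0.7)], [Cell("llama"), Cell(0.5), Cell(None)]]
    print(row_means(["Model", "MMLU", "GSM8K"], rows))  # [0.8, 0.5]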
@@ -398,8 +406,8 @@ class Summarizer:
         included = False
         for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
             if run_group_name not in self.schema.name_to_run_group:
-                hlog(
-                    f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                hwarn(
+                    f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                     f"but undefined in {self.schema_path}, skipping"
                 )
                 continue
@@ -432,14 +440,14 @@
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -536,8 +544,8 @@
 
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
 
@@ -730,8 +738,8 @@
             if stat is None:
                 # Print out near misses to provide a more informative warning
                 near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-                hlog(
-                    f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+                hwarn(
+                    f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                     f"{len(near_misses)} near misses matching just the name"
                 )
                 if len(near_misses) > 0:
@@ -802,7 +810,7 @@
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title} has no rows or columns, leaving empty")
             return Table("empty", [], [])
 
         header: List[HeaderCell] = []
@@ -811,7 +819,7 @@
         num_groups = len(set(run_group.name for run_group, _ in columns))  # number of unique groups, determines headers
 
         # Column headers
-        header.append(HeaderCell("Model/adapter"))
+        header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
         for run_group, metric_group_name in columns:
             # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
             if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -823,7 +831,7 @@
                     matcher = replace(matcher, sub_split=sub_split)
                 header_field = self.schema.name_to_metric.get(matcher.name)
                 if header_field is None:
-                    hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                    hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                     continue
                 metadata = {
                     "metric": header_field.get_short_display_name(),
@@ -951,8 +959,8 @@
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -969,22 +977,20 @@
         if strategy == AggregationStrategy.WIN_RATE:
             WIN_RATE_AGGREGATION = "mean"
             win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-            description = "How many models this model outperforms on average (over columns)."
             aggregate_header_cells.append(
                 HeaderCell(
                     f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                    description=description,
+                    description="How many models this model outperforms on average (over columns).",
                     lower_is_better=False,
                 )
             )
             aggregate_row_values.append(win_rates)
         elif strategy == AggregationStrategy.MEAN:
             means = compute_aggregate_row_means(table)
-            description = "An average over columns representing the mean performance."
             aggregate_header_cells.append(
                 HeaderCell(
-                    "Mean performance",
-                    description=description,
+                    "Mean score",
+                    description="The mean of the scores from all columns.",
                     lower_is_better=table.header[0].lower_is_better,
                 )
             )
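For the WIN_RATE strategy, the aggregate column reported per model is its mean win rate: in each score column, the fraction of other models it beats, averaged over columns. `compute_aggregate_row_win_rates` itself is not shown in this diff, so the following is an illustrative re-derivation of the header description above rather than HELM's actual code:

    from typing import List


    def mean_win_rates(columns: List[List[float]], lower_is_better: bool = False) -> List[float]:
        # columns[k][i] is model i's score on metric k; assumes >= 2 models.
        num_models = len(columns[0])
        per_model: List[List[float]] = [[] for _ in range(num_models)]
        for col in columns:
            for i, score in enumerate(col):
                others = [s for j, s in enumerate(col) if j != i]
                wins = sum(1 for s in others if (score < s if lower_is_better else score > s))
                per_model[i].append(wins / len(others))
        # Mean over columns, matching WIN_RATE_AGGREGATION = "mean".
        return [sum(rates) / len(rates) for rates in per_model]


    # Two metric columns, three models:
    print(mean_win_rates([[0.9, 0.5, 0.1], [0.8, 0.6, 0.7]]))  # [1.0, 0.25, 0.25]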
@@ -1226,10 +1232,57 @@
 
 
 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
@@ -1247,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1272,47 +1334,15 @@
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
-    args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    args = parser.parse_args()
+    setup_default_logging()
+    summarize(args)
 
 
 if __name__ == "__main__":
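With `--summarizer-class-name` in place, downstream users can point `helm-summarize` at their own `Summarizer` subclass. A hypothetical example (the module path `my_pkg.my_module` and the subclass are placeholders, not part of this release):

    from helm.benchmark.presentation.summarize import Summarizer


    class CustomSummarizer(Summarizer):
        """Hypothetical subclass; a real one might override table construction
        or JSON emission while reusing the rest of the pipeline."""


    # Selected at runtime, e.g.:
    #   helm-summarize --suite my-suite --summarizer-class-name my_pkg.my_module.CustomSummarizer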