crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the release notes in its registry listing for more details.

Files changed (1033):
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -9,12 +9,11 @@ Usage:
9
9
  """
10
10
 
11
11
  import argparse
12
- import cattrs
12
+ import dataclasses
13
13
  import os
14
14
  import datetime
15
15
  import urllib.parse
16
16
  import json
17
- import yaml
18
17
  from collections import defaultdict
19
18
  from dataclasses import dataclass, replace
20
19
  from statistics import mean, median
@@ -22,50 +21,50 @@ from typing import List, Optional, Dict, Any, Tuple, Set
22
21
 
23
22
  from tqdm import tqdm
24
23
  from helm.benchmark.model_deployment_registry import get_model_deployment
25
-
26
24
  from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
27
25
  from helm.common.general import (
28
26
  write,
29
27
  ensure_directory_exists,
30
28
  asdict_without_nones,
31
- serialize_dates,
32
29
  parallel_map,
33
30
  singleton,
34
31
  unique_simplification,
35
32
  )
36
33
  from helm.common.codec import from_json
37
- from helm.common.hierarchical_logger import hlog, htrack, htrack_block
38
- from helm.benchmark.scenarios.scenario import ScenarioSpec
34
+ from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
35
+ from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
39
36
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
40
- from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
41
- from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
42
37
  from helm.benchmark.metrics.metric_name import MetricName
43
- from helm.benchmark.metrics.metric import get_all_stats_by_name
38
+ from helm.benchmark.metrics.metric import (
39
+ MetricInterface,
40
+ MetricMetadata,
41
+ MetricSpec,
42
+ create_metric,
43
+ get_all_stats_by_name,
44
+ )
44
45
  from helm.benchmark.metrics.statistic import Stat, merge_stat
45
- from helm.benchmark.runner import RunSpec, LATEST_SYMLINK
46
+ from helm.benchmark.run_spec import RunSpec
47
+ from helm.benchmark.runner import LATEST_SYMLINK
46
48
  from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
47
49
  from helm.benchmark.presentation.schema import (
50
+ MetricGroup,
48
51
  MetricNameMatcher,
49
52
  RunGroup,
53
+ Field,
54
+ Schema,
50
55
  read_schema,
51
- SCHEMA_CLASSIC_YAML_FILENAME,
56
+ get_default_schema_path,
52
57
  BY_GROUP,
53
58
  THIS_GROUP_ONLY,
54
59
  NO_GROUPS,
55
60
  )
56
- from helm.benchmark.presentation.contamination import (
57
- read_contamination,
58
- validate_contamination,
59
- CONTAMINATION_SYMBOLS,
60
- CONTAMINATION_STYLES,
61
- CONTAMINATION_LEVEL_STRONG,
62
- )
63
61
  from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
64
62
  from helm.benchmark.presentation.run_display import write_run_display_json
65
- from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata
63
+ from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
64
+ from helm.common.object_spec import get_class_by_name
66
65
 
67
66
 
68
- OVERLAP_N_COUNT = 13
67
+ MODEL_HEADER_CELL_VALUE = "Model"
69
68
 
70
69
 
71
70
  @dataclass(frozen=True)
@@ -112,7 +111,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
112
111
  # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
113
112
  # as the main metric but multiple_choice_separate_original only generates exact_match
114
113
  if matcher.name == "quasi_exact_match":
115
- hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
114
+ hwarn("No quasi_exact_match metric found, looking for exact_match instead")
116
115
  matcher = replace(matcher, name="exact_match")
117
116
  matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
118
117
  if len(matching_stats) == 0:
@@ -172,7 +171,7 @@ def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetad
172
171
  except ValueError:
173
172
  pass
174
173
 
175
- # Return a placeholder "unknoown model" model metadata.
174
+ # Return a placeholder "unknown model" model metadata.
176
175
  return get_unknown_model_metadata(adapter_spec.model)
177
176
 
178
177
 
@@ -233,25 +232,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
233
232
  """
234
233
  assert aggregation in ["mean", "median"]
235
234
  win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
236
- for i, header_cell in enumerate(table.header):
235
+ for column_index, header_cell in enumerate(table.header):
237
236
  lower_is_better = header_cell.lower_is_better
238
237
  if lower_is_better is None: # column does not have a meaningful ordering
239
238
  continue
240
-
241
- # sort row indices by cell value and then compute the number of wins as the index in the sorted list
242
- def is_cell_valid(cell: Cell) -> bool: # ignore cells which are strongly contaminated or have no value
243
- if cell.value is None:
244
- return False
245
- if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
246
- return False
247
- return True
248
-
249
- values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
250
- if len(values) < 2: # don't rank a single model
239
+ value_to_count: Dict[float, int] = defaultdict(int)
240
+ for row in table.rows:
241
+ value = row[column_index].value
242
+ if value is not None:
243
+ value_to_count[value] += 1
244
+ value_to_wins: Dict[float, float] = {}
245
+ acc_count = 0
246
+ for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
247
+ value_to_wins[value] = acc_count + ((value_count - 1) / 2)
248
+ acc_count += value_count
249
+ total_count = acc_count
250
+ if total_count < 2:
251
251
  continue
252
- for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
253
- win_rate = wins / (len(values) - 1) # normalize to [0, 1]
254
- win_rates_per_row[j].append(win_rate)
252
+ for row_index, row in enumerate(table.rows):
253
+ value = row[column_index].value
254
+ if value is not None:
255
+ win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))
255
256
 
256
257
  # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
257
258
  # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
@@ -266,7 +267,47 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
266
267
  return aggregate_win_rates
267
268
 
268
269
 
269
- AGGREGATE_WIN_RATE_COLUMN = 1
270
+ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
271
+ """
272
+ Computes the aggregate mean of each row across columns.
273
+ Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
274
+ non-null values of the row are in columns we skip).
275
+ """
276
+
277
+ row_means: List[Optional[float]] = []
278
+ # if the first column contains the names of models, do not treat it like a value column
279
+ skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
280
+
281
+ # check for all header cells where specified, that lower_is_better is consistent
282
+ orderings = []
283
+ header_cells = table.header[1:] if skip_first_column else table.header
284
+ for header_cell in header_cells:
285
+ orderings.append(header_cell.lower_is_better)
286
+ if len(set(orderings)) != 1:
287
+ raise Exception("Cannot mean columns with different values for lower_is_better")
288
+
289
+ for row in table.rows:
290
+ total = 0.0
291
+ count = 0
292
+ row_cells = row[1:] if skip_first_column else row
293
+ for cell in row_cells:
294
+ if cell.value is not None:
295
+ total += float(cell.value)
296
+ count += 1
297
+ if count == 0:
298
+ row_means.append(None)
299
+ else:
300
+ row_means.append(total / count)
301
+
302
+ return row_means
303
+
304
+
305
+ class AggregationStrategy:
306
+ WIN_RATE = "win_rate"
307
+ MEAN = "mean"
308
+
309
+
310
+ ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]
270
311
 
271
312
 
272
313
  class Summarizer:
@@ -309,7 +350,7 @@ class Summarizer:
309
350
  release: Optional[str],
310
351
  suites: Optional[List[str]],
311
352
  suite: Optional[str],
312
- schema_file: str,
353
+ schema_path: Optional[str],
313
354
  output_path: str,
314
355
  verbose: bool,
315
356
  num_threads: int,
@@ -329,7 +370,7 @@ class Summarizer:
329
370
  self.suites: List[str]
330
371
  self.run_suite_paths: List[str]
331
372
  self.suite: Optional[str] = None
332
- self.schema_file = schema_file
373
+ self.schema_path = schema_path
333
374
  self.release: Optional[str] = None
334
375
  if suite:
335
376
  self.suite = suite
@@ -344,12 +385,8 @@ class Summarizer:
344
385
  self.verbose: bool = verbose
345
386
  self.num_threads: int = num_threads
346
387
  self.allow_unknown_models: bool = allow_unknown_models
347
-
348
- ensure_directory_exists(self.run_release_path)
349
-
350
- self.schema = read_schema(schema_file)
351
- self.contamination = read_contamination()
352
- validate_contamination(self.contamination, self.schema)
388
+ self.schema = read_schema(schema_path) if schema_path else Schema()
389
+ self.metric_metadata: List[MetricMetadata] = []
353
390
 
354
391
  def read_run(self, run_path: str) -> Run:
355
392
  """Load the `Run` object from `run_path`."""
@@ -375,9 +412,9 @@ class Summarizer:
375
412
  included = False
376
413
  for run_group_name in run.run_spec.groups: # go through the groups of the run to determine visibility
377
414
  if run_group_name not in self.schema.name_to_run_group:
378
- hlog(
379
- f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
380
- f"but undefined in {self.schema_file}, skipping"
415
+ hwarn(
416
+ f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
417
+ f"but undefined in {self.schema_path}, skipping"
381
418
  )
382
419
  continue
383
420
  run_group = self.schema.name_to_run_group[run_group_name]
@@ -396,6 +433,8 @@ class Summarizer:
396
433
 
397
434
  def read_runs_for_suite(self, suite, run_suite_path):
398
435
  """Load the runs in the run suite path."""
436
+ if not os.path.exists(run_suite_path):
437
+ raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
399
438
  # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
400
439
  # so filter them out.
401
440
  run_dir_names = sorted(
@@ -409,14 +448,14 @@ class Summarizer:
409
448
  run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
410
449
  stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
411
450
  if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
412
- hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
451
+ hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
413
452
  continue
414
453
  run_path: str = os.path.join(run_suite_path, run_dir_name)
415
454
  run = self.read_run(run_path)
416
455
  self.runs.append(run)
417
456
  if run.run_spec.name in self.runs_to_run_suites:
418
- hlog(
419
- f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
457
+ hwarn(
458
+ f"Run entry {run.run_spec.name} is present in two different Run Suites. "
420
459
  f"Defaulting to the latest assigned suite: {suite}"
421
460
  )
422
461
  self.runs_to_run_suites[run.run_spec.name] = suite
@@ -433,11 +472,205 @@ class Summarizer:
433
472
  self.group_adapter_to_runs[group_name][adapter_spec].append(run)
434
473
  self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
435
474
 
436
- def write_schema(self):
475
+ @dataclass(frozen=True)
476
+ class _ModelField(Field):
477
+ """The frontend version of ModelMetadata.
478
+
479
+ The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
480
+
481
+ All attributes have the same meaning as in ModelMetadata."""
482
+
483
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
484
+ creator_organization: Optional[str] = None
485
+ access: Optional[str] = None
486
+ todo: bool = False
487
+ release_date: Optional[str] = None
488
+ num_parameters: Optional[int] = None
489
+
490
+ def get_model_field_dicts(self) -> List[Dict]:
491
+ """Get a list of `ModelField`s dicts that will be written to schema.json.
492
+
493
+ The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
494
+
495
+ This is populated by reading the `ModelMetadata` configs and filtering down to models that were
496
+ actually used, and converting each `ModelMetadata` to a `ModelField`."""
497
+ # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
498
+ used_model_names: Set[str] = set()
499
+ for run in self.runs:
500
+ used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
501
+
502
+ model_field_dicts: List[Dict] = []
503
+ for model_name in get_all_models():
504
+ if model_name not in used_model_names:
505
+ continue
506
+ model_metadata = get_model_metadata(model_name)
507
+ model_field = Summarizer._ModelField(
508
+ name=model_metadata.name,
509
+ display_name=model_metadata.display_name,
510
+ short_display_name=model_metadata.display_name,
511
+ description=model_metadata.description,
512
+ creator_organization=model_metadata.creator_organization_name,
513
+ access=model_metadata.access,
514
+ todo=False,
515
+ release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
516
+ num_parameters=model_metadata.num_parameters,
517
+ )
518
+ model_field_dicts.append(asdict_without_nones(model_field))
519
+ return model_field_dicts
520
+
521
+ def get_metric_metadata(self) -> List[MetricMetadata]:
522
+ if self.metric_metadata:
523
+ return self.metric_metadata
524
+ metric_specs: List[MetricSpec] = []
525
+ for run in self.runs:
526
+ metric_specs.extend(run.run_spec.metric_specs)
527
+ metric_specs = list(set(metric_specs))
528
+ metric_name_to_metadata: Dict[str, MetricMetadata] = {}
529
+ for metric_spec in metric_specs:
530
+ try:
531
+ metric: MetricInterface = create_metric(metric_spec)
532
+ metric_metadata_list = metric.get_metadata()
533
+ for metric_metadata in metric_metadata_list:
534
+ metric_name_to_metadata[metric_metadata.name] = metric_metadata
535
+ except NotImplementedError:
536
+ pass
537
+ except (ModuleNotFoundError, AttributeError, TypeError):
538
+ pass
539
+
540
+ run_stat_names: Set[str] = set()
541
+ for run in self.runs:
542
+ for stat in run.stats:
543
+ run_stat_names.add(stat.name.name)
544
+
545
+ metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
546
+ for metric_name_to_prune in metric_names_to_prune:
547
+ del metric_name_to_metadata[metric_name_to_prune]
548
+ self.metric_metadata = list(metric_name_to_metadata.values())
549
+ return self.metric_metadata
550
+
551
+ def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
552
+ return Field(
553
+ name=metric_metadata.name,
554
+ display_name=metric_metadata.display_name,
555
+ short_display_name=metric_metadata.short_display_name,
556
+ description=metric_metadata.description,
557
+ lower_is_better=metric_metadata.lower_is_better,
558
+ )
559
+
560
+ def auto_generate_metric_fields(self) -> List[Field]:
561
+ return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
562
+
563
+ def auto_generate_metric_groups(self) -> List[MetricGroup]:
564
+ metric_groups = [
565
+ MetricGroup(
566
+ name="main_metric",
567
+ display_name="Main Metric",
568
+ description="Main Metric",
569
+ metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
570
+ )
571
+ ]
572
+ metric_group_to_metrics: Dict[str, List[str]] = {}
573
+ for metric_metadata in self.metric_metadata:
574
+ if metric_metadata.group:
575
+ if metric_metadata.group not in metric_group_to_metrics:
576
+ metric_group_to_metrics[metric_metadata.group] = []
577
+ metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
578
+ for metric_group, metric_names in metric_group_to_metrics.items():
579
+ display_name = metric_group.replace("_", " ").capitalize()
580
+ metric_groups.append(
581
+ MetricGroup(
582
+ name=metric_group,
583
+ # TODO: Make display_name and description nicer
584
+ display_name=display_name,
585
+ description=display_name,
586
+ aggregation_strategies=[],
587
+ metrics=[
588
+ MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
589
+ ],
590
+ )
591
+ )
592
+ return metric_groups
593
+
594
+ def get_scenario_metadata(self) -> List[ScenarioMetadata]:
595
+ scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
596
+ scenario_specs = list(set(scenario_specs))
597
+ scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
598
+ for scenario_spec in scenario_specs:
599
+ try:
600
+ scenario: Scenario = create_scenario(scenario_spec)
601
+ scenario_metadata = scenario.get_metadata()
602
+ scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
603
+ except NotImplementedError:
604
+ pass
605
+ except (ModuleNotFoundError, AttributeError, TypeError):
606
+ pass
607
+
608
+ run_groups: Set[str] = set()
609
+ for run in self.runs:
610
+ for run_group in run.run_spec.groups:
611
+ run_groups.add(run_group)
612
+
613
+ scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
614
+ for scenario_name_to_prune in scenario_names_to_prune:
615
+ del scenario_name_to_metadata[scenario_name_to_prune]
616
+ return list(scenario_name_to_metadata.values())
617
+
618
+ def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
619
+ metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
620
+ return RunGroup(
621
+ name=scenario_metadata.name,
622
+ display_name=scenario_metadata.display_name,
623
+ short_display_name=scenario_metadata.short_display_name,
624
+ description=scenario_metadata.description,
625
+ metric_groups=metric_group_names,
626
+ environment={
627
+ "main_name": scenario_metadata.main_metric,
628
+ "main_split": scenario_metadata.main_split,
629
+ },
630
+ taxonomy=scenario_metadata.taxonomy,
631
+ )
632
+
633
+ def auto_generate_all_scenarios_run_group(self) -> RunGroup:
634
+ return RunGroup(
635
+ name="all_scenarios",
636
+ display_name="All Scenarios",
637
+ description="All scenarios",
638
+ category="Scenario Groups",
639
+ subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
640
+ )
641
+
642
+ def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
643
+ return [
644
+ self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
645
+ ]
646
+
647
+ def fix_up_schema(self) -> None:
648
+ # if not self.schema.run_groups:
649
+ if not self.schema.metrics:
650
+ self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
651
+ # Can only auto-generate metric groups if metrics were also auto-generated
652
+ # because auto_generate_metric_groups() requires self.metric_metadata()
653
+ # which is populated by auto_generate_metric_fields()
654
+ if not self.schema.metric_groups:
655
+ self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
656
+ if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
657
+ self.schema = dataclasses.replace(
658
+ self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
659
+ )
660
+ if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
661
+ self.schema = dataclasses.replace(
662
+ self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
663
+ )
664
+
665
+ def write_schema(self) -> None:
437
666
  """Write the schema file to benchmark_output so the frontend knows about it."""
667
+ # Manually add the model metadata to the schema.json, where the frontend expects it.
668
+ # TODO: Move model metadata out of schema.json into its own model_metadata.json file.
669
+ raw_schema = asdict_without_nones(self.schema)
670
+ raw_schema["models"] = self.get_model_field_dicts()
438
671
  write(
439
672
  os.path.join(self.run_release_path, "schema.json"),
440
- json.dumps(asdict_without_nones(self.schema), indent=2, default=serialize_dates),
673
+ json.dumps(raw_schema, indent=2),
441
674
  )
442
675
 
443
676
  def read_runs(self):
@@ -450,136 +683,6 @@ class Summarizer:
450
683
  for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
451
684
  self.read_runs_for_suite(suite, run_suite_path)
452
685
 
453
- def read_overlap_stats(self):
454
- """
455
- Load the overlap stats in the run suite path.
456
- Concretely:
457
- - get group -> scenario_spec information from self.runs
458
- run_spec data
459
- - read the files in the data_overlap directory in run_suite_path
460
- which are scenario_spec -> overlap ids
461
- - get aggregate stats for group -> overlap ratio
462
- """
463
-
464
- def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
465
- scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
466
- for run_spec in run_specs:
467
- scenario_spec = run_spec.scenario_spec
468
- groups = run_spec.groups
469
- if (
470
- scenario_spec.class_name
471
- != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
472
- ):
473
- scenario_specs_to_groups[scenario_spec] = groups
474
-
475
- group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
476
- for scenario_spec, groups in scenario_specs_to_groups.items():
477
- for group in groups:
478
- if group not in group_to_scenario_specs:
479
- group_to_scenario_specs[group] = []
480
- group_to_scenario_specs[group].append(scenario_spec)
481
- return group_to_scenario_specs
482
-
483
- def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
484
- """
485
- Takes the data_overlap_dir as input and returns a dictionary
486
- of stats_file_path -> List(model_names)
487
-
488
- Sample input:
489
- file_models_mapping:
490
- - file_name: file1
491
- model_names:
492
- - model1
493
- - model2
494
- - file_name: file2
495
- model_names:
496
- - model2
497
- - model3
498
-
499
- """
500
- metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
501
- if not os.path.exists(metadata_file_path):
502
- return {}
503
-
504
- with open(metadata_file_path, "r") as yaml_file:
505
- data = yaml.safe_load(yaml_file)
506
-
507
- file_metadata: Dict[str, List[str]] = {}
508
- for entry in data["file_models_mapping"]:
509
- if "file_name" in entry and "model_names" in entry:
510
- file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
511
- file_metadata[file_path] = entry["model_names"]
512
-
513
- return file_metadata
514
-
515
- self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
516
-
517
- data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
518
- if not os.path.isdir(data_overlap_dir):
519
- hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
520
- return
521
-
522
- group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
523
-
524
- stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
525
-
526
- for file_path, model_names in stats_file_metadata.items():
527
- overlap_stats_jsons = open(file_path, "r").readlines()
528
-
529
- data_overlap_stats_list: List[DataOverlapStats] = []
530
- for overlap_stats_json in overlap_stats_jsons:
531
- overlap_stats_dict = json.loads(overlap_stats_json)
532
- data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
533
-
534
- scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
535
- for data_overlap_stats in data_overlap_stats_list:
536
- data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
537
- n = data_overlap_stats_key.overlap_protocol_spec.n
538
- if n == OVERLAP_N_COUNT:
539
- light_scenario_key = data_overlap_stats_key.light_scenario_key
540
- scenario_spec = light_scenario_key.scenario_spec
541
- if scenario_spec in self.scenario_spec_instance_id_dict:
542
- # Get statistics based on the subset of instance_ids that HELM uses for a scenario
543
- instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
544
- num_instances = len(instance_ids)
545
- num_overlapping_inputs = len(
546
- set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
547
- )
548
- num_overlapping_references = len(
549
- set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
550
- )
551
- scenario_spec_overlap_counts[scenario_spec] = (
552
- num_instances,
553
- num_overlapping_inputs,
554
- num_overlapping_references,
555
- )
556
-
557
- for group, scenario_specs in group_to_scenario_specs.items():
558
- group_num_instances = 0
559
- group_num_overlapping_inputs = 0
560
- group_num_overlapping_references = 0
561
- for scenario_spec in scenario_specs:
562
- if scenario_spec in scenario_spec_overlap_counts:
563
- (
564
- num_instances,
565
- num_overlapping_inputs,
566
- num_overlapping_references,
567
- ) = scenario_spec_overlap_counts[scenario_spec]
568
- group_num_instances += num_instances
569
- group_num_overlapping_inputs += num_overlapping_inputs
570
- group_num_overlapping_references += num_overlapping_references
571
- if group_num_instances != 0:
572
- group_overlap_stats = GroupOverlapStats(
573
- group=group,
574
- num_instances=group_num_instances,
575
- num_overlapping_inputs=group_num_overlapping_inputs,
576
- num_overlapping_references=group_num_overlapping_references,
577
- )
578
- for model_name in model_names:
579
- # Assume model name will only be associated with single group overlap list for now
580
- # can update to join lists if need arises
581
- self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats
582
-
583
686
  @htrack(None)
584
687
  def check_metrics_defined(self):
585
688
  """Check that all the metrics that appear in stats are defined."""
@@ -593,8 +696,8 @@ class Summarizer:
593
696
 
594
697
  for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
595
698
  if metric_name not in defined_metric_names:
596
- hlog(
597
- f"WARNING: metric name {metric_name} undefined in {self.schema_file} "
699
+ hwarn(
700
+ f"metric name {metric_name} undefined in {self.schema_path} "
598
701
  f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
599
702
  )
600
703
 
@@ -726,9 +829,6 @@ class Summarizer:
726
829
  num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
727
830
  num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
728
831
 
729
- if len(num_instances) == 0:
730
- continue
731
-
732
832
  rows.append(
733
833
  [
734
834
  Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
@@ -762,9 +862,9 @@ class Summarizer:
762
862
  self,
763
863
  runs: List[Run],
764
864
  matcher: MetricNameMatcher,
765
- contamination_level: Optional[str],
766
865
  additional_info: Optional[str],
767
866
  hide_value: bool = False,
867
+ is_scenario_table: bool = False,
768
868
  ) -> Cell:
769
869
  """
770
870
  Use the metric name identified by `matcher` to pull out the stats from
@@ -790,8 +890,8 @@ class Summarizer:
790
890
  if stat is None:
791
891
  # Print out near misses to provide a more informative warning
792
892
  near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
793
- hlog(
794
- f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
893
+ hwarn(
894
+ f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
795
895
  f"{len(near_misses)} near misses matching just the name"
796
896
  )
797
897
  if len(near_misses) > 0:
@@ -818,23 +918,38 @@ class Summarizer:
818
918
  if self.verbose:
819
919
  description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
820
920
 
821
- style: Dict[str, Any] = {}
822
- if contamination_level is not None:
823
- style = CONTAMINATION_STYLES.get(contamination_level, style)
824
-
825
- return Cell(value=value, description=description, style=style, contamination_level=contamination_level)
921
+ # Link the runs that this cell was aggregated from, if this is not a scenario table.
922
+ # Scenario tables link to the runs in the model cells,
923
+ # whereas non-scenario tables link to the runs in the metrics cells.
924
+ run_spec_names: Optional[List] = None
925
+ if not is_scenario_table:
926
+ # Deduplicate run spec names becuase aggregated_run_specs may have duplicated
927
+ # run specs if a run spec belongs to multiple groups.
928
+ run_spec_names = []
929
+ run_spec_names_set = set()
930
+ for run_spec_name in aggregated_run_specs:
931
+ if run_spec_name not in run_spec_names_set:
932
+ run_spec_names.append(run_spec_name)
933
+ run_spec_names_set.add(run_spec_name)
934
+
935
+ return Cell(
936
+ value=value,
937
+ description=description,
938
+ style={},
939
+ run_spec_names=run_spec_names,
940
+ )
826
941
 
827
942
  def create_group_table(
828
943
  self,
829
944
  name: str,
830
945
  title: str,
831
946
  adapter_to_runs: Dict[AdapterSpec, List[Run]],
832
- link_to_runs: bool,
947
+ is_scenario_table: bool,
833
948
  columns: List[Tuple[RunGroup, str]], # run_group, metric_group
834
949
  sort_by_model_order: bool = True,
835
950
  sub_split: Optional[str] = None,
836
951
  bold_columns: bool = True,
837
- add_win_rate: bool = False,
952
+ aggregation_strategies: List[str] = [],
838
953
  ) -> Table:
839
954
  """
840
955
  Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -847,7 +962,7 @@ class Summarizer:
847
962
  # Create header (cells to display) and the list of metric name filters
848
963
  # (to pull out information later).
849
964
  if not columns or not adapter_to_runs:
850
- hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
965
+ hwarn(f"table {title}, has no rows or columns, leaving empty")
851
966
  return Table("empty", [], [])
852
967
 
853
968
  header: List[HeaderCell] = []
@@ -856,7 +971,7 @@ class Summarizer:
856
971
  num_groups = len(set(run_group.name for run_group, _ in columns)) # number of unique groups, determines headers
857
972
 
858
973
  # Column headers
859
- header.append(HeaderCell("Model/adapter"))
974
+ header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
860
975
  for run_group, metric_group_name in columns:
861
976
  # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
862
977
  if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -868,7 +983,7 @@ class Summarizer:
868
983
  matcher = replace(matcher, sub_split=sub_split)
869
984
  header_field = self.schema.name_to_metric.get(matcher.name)
870
985
  if header_field is None:
871
- hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_file}, skipping")
986
+ hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
872
987
  continue
873
988
  metadata = {
874
989
  "metric": header_field.get_short_display_name(),
@@ -876,7 +991,8 @@ class Summarizer:
876
991
  }
877
992
 
878
993
  header_name = header_field.get_short_display_name()
879
- description = (run_group.description + "\n\n" if run_group.description is not None else "") + (
994
+ run_group_short_description = run_group.short_description or run_group.description or ""
995
+ description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
880
996
  (header_field.display_name if header_field.display_name else header_field.name)
881
997
  + ": "
882
998
  + (header_field.description if header_field.description is not None else "")
@@ -921,10 +1037,10 @@ class Summarizer:
921
1037
 
922
1038
  adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
923
1039
  if sort_by_model_order:
924
- # Sort models by the order defined in the schema.
925
- # Models not defined in the schema will be sorted alphabetically and
926
- # placed before models in defined the schema.
927
- model_order = [model.name for model in self.schema.models]
1040
+ # Sort models by the order defined in the the model metadata config.
1041
+ # Models not defined in the model metadata config will be sorted alphabetically and
1042
+ # placed before models in defined the model metadata config.
1043
+ model_order = get_all_models()
928
1044
 
929
1045
  def _adapter_spec_sort_key(spec):
930
1046
  index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
@@ -948,21 +1064,18 @@ class Summarizer:
948
1064
  runs = adapter_to_runs[adapter_spec]
949
1065
  display_name = get_method_display_name(model_metadata.display_name, info)
950
1066
 
951
- # Link to all the runs under this model
952
- if link_to_runs:
1067
+ # Link the runs that this row was aggregated from, if this is a scenario table.
1068
+ # Scenario tables link to the runs in the model cells,
1069
+ # whereas non-scenario tables link to the runs in the metrics cells.
1070
+ run_spec_names: Optional[List[str]]
1071
+ if is_scenario_table:
953
1072
  run_spec_names = [run.run_spec.name for run in runs]
954
1073
  href = run_spec_names_to_url(run_spec_names)
955
1074
  else:
1075
+ run_spec_names = None
956
1076
  href = None
957
1077
 
958
- # Render contamination information
959
- point = self.contamination.get_point(model_name, columns[0][0].name)
960
- if num_groups == 1 and point is not None: # display contamination information at the adapter level
961
- cells = [
962
- Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
963
- ]
964
- else:
965
- cells = [Cell(display_name, description="", href=href)]
1078
+ cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
966
1079
  assert len(group_names) == len(matchers)
967
1080
  for group_name, matcher in zip(group_names, matchers):
968
1081
  group_runs = [run for run in runs if group_name in run.run_spec.groups]
@@ -971,23 +1084,7 @@ class Summarizer:
971
1084
  if "babi" in group_name and "task:" not in name:
972
1085
  group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]
973
1086
 
974
- point = self.contamination.get_point(model_name, group_name)
975
- if point is not None:
976
- description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
977
- contamination_level = point.level
978
- else:
979
- description = ""
980
- contamination_level = None
981
-
982
- group_overlap_stats = None
983
- if (model_name, group_name) in self._model_group_overlap_stats:
984
- group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
985
-
986
- description = (
987
- f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
988
- f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
989
- f"{description}"
990
- )
1087
+ description = ""
991
1088
 
992
1089
  # HACK: we want to hide stats for the following model-metric combinations:
993
1090
  # 1. Calibration metrics + AI21/Anthropic
@@ -999,9 +1096,9 @@ class Summarizer:
999
1096
  self.create_cell(
1000
1097
  group_runs,
1001
1098
  matcher,
1002
- contamination_level,
1003
1099
  additional_info=description,
1004
1100
  hide_value=hide_value,
1101
+ is_scenario_table=is_scenario_table,
1005
1102
  )
1006
1103
  )
1007
1104
 
@@ -1011,12 +1108,12 @@ class Summarizer:
1011
1108
  # There could be a ton of runs, so only do this if there are 2-5
1012
1109
  # TODO: replace in frontend with a selector to choose which rows to visualize.
1013
1110
  links = []
1014
- if link_to_runs:
1111
+ if is_scenario_table:
1015
1112
  all_run_spec_names = []
1016
1113
  for adapter_spec, runs in adapter_to_runs.items():
1017
1114
  if len(runs) > 1:
1018
- hlog(
1019
- f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
1115
+ hwarn(
1116
+ f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
1020
1117
  f" {[run.run_spec.name for run in runs]}"
1021
1118
  )
1022
1119
  for run in runs:
@@ -1026,21 +1123,42 @@ class Summarizer:
1026
1123
 
1027
1124
  table = Table(title=title, header=header, rows=rows, links=links, name=name)
1028
1125
 
1029
- if add_win_rate:
1030
- # add overall win rate as the second column
1031
- WIN_RATE_AGGREGATION = "mean"
1032
- win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
1033
- description = "How many models this model outperform on average (over columns)."
1034
- table.header.insert(
1035
- AGGREGATE_WIN_RATE_COLUMN,
1036
- HeaderCell(
1037
- f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
1038
- description=description,
1039
- lower_is_better=False,
1040
- ),
1041
- )
1042
- for row, win_rate in zip(table.rows, win_rates):
1043
- row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
1126
+ aggregate_header_cells: List[HeaderCell] = []
1127
+ aggregate_row_values: List[List[Optional[float]]] = []
1128
+
1129
+ for strategy in aggregation_strategies:
1130
+ if strategy == AggregationStrategy.WIN_RATE:
1131
+ WIN_RATE_AGGREGATION = "mean"
1132
+ win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
1133
+ aggregate_header_cells.append(
1134
+ HeaderCell(
1135
+ f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
1136
+ description="How many models this model outperforms on average (over columns).",
1137
+ lower_is_better=False,
1138
+ )
1139
+ )
1140
+ aggregate_row_values.append(win_rates)
1141
+ elif strategy == AggregationStrategy.MEAN:
1142
+ means = compute_aggregate_row_means(table)
1143
+ aggregate_header_cells.append(
1144
+ HeaderCell(
1145
+ "Mean score",
1146
+ description="The mean of the scores from all columns.",
1147
+ lower_is_better=table.header[0].lower_is_better,
1148
+ )
1149
+ )
1150
+ aggregate_row_values.append(means)
1151
+ else:
1152
+ raise Exception(
1153
+ f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
1154
+ )
1155
+
1156
+ for i in range(len(aggregate_header_cells)):
1157
+ aggregate_header_cell = aggregate_header_cells[i]
1158
+ aggregate_rows = aggregate_row_values[i]
1159
+ table.header.insert(i + 1, aggregate_header_cell)
1160
+ for row, row_val in zip(table.rows, aggregate_rows):
1161
+ row.insert(i + 1, Cell(row_val))
1044
1162
 
1045
1163
  if bold_columns:
1046
1164
  for i, header_cell in enumerate(table.header):
@@ -1088,16 +1206,25 @@ class Summarizer:
1088
1206
 
1089
1207
  if len(adapter_to_runs) > 0:
1090
1208
  for metric_group in all_metric_groups:
1091
- display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
1209
+ metric_group_config = self.schema.name_to_metric_group[metric_group]
1210
+ display_name = metric_group_config.get_short_display_name()
1211
+ aggregate_strategies: List[str]
1212
+ if metric_group_config.aggregation_strategies is not None:
1213
+ aggregate_strategies = metric_group_config.aggregation_strategies
1214
+ elif metric_group_config.hide_win_rates:
1215
+ aggregate_strategies = []
1216
+ else:
1217
+ aggregate_strategies = [AggregationStrategy.WIN_RATE]
1092
1218
  table = self.create_group_table(
1093
1219
  name=metric_group,
1094
1220
  title=display_name,
1095
1221
  adapter_to_runs=adapter_to_runs,
1096
1222
  columns=[(subgroup, metric_group) for subgroup in subgroups],
1097
- link_to_runs=False,
1098
- add_win_rate=True,
1223
+ is_scenario_table=False,
1224
+ aggregation_strategies=aggregate_strategies,
1099
1225
  )
1100
- tables.append(table)
1226
+ if len(table.header) > 1:
1227
+ tables.append(table)
1101
1228
  return tables
1102
1229
 
1103
1230
  def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1126,7 +1253,7 @@ class Summarizer:
1126
1253
  name=scenario_name,
1127
1254
  adapter_to_runs=adapter_to_runs,
1128
1255
  columns=columns,
1129
- link_to_runs=True,
1256
+ is_scenario_table=True,
1130
1257
  )
1131
1258
  tables.append(table)
1132
1259
  scenarios_shown += 1
@@ -1138,7 +1265,7 @@ class Summarizer:
1138
1265
  name=f"{subgroup.name}:sub_split={sub_split}",
1139
1266
  adapter_to_runs=adapter_to_runs,
1140
1267
  columns=columns,
1141
- link_to_runs=False,
1268
+ is_scenario_table=False,
1142
1269
  sub_split=sub_split,
1143
1270
  )
1144
1271
  tables.append(table)
@@ -1158,7 +1285,7 @@ class Summarizer:
1158
1285
  name=subgroup.name,
1159
1286
  adapter_to_runs=adapter_to_runs,
1160
1287
  columns=columns,
1161
- link_to_runs=False,
1288
+ is_scenario_table=False,
1162
1289
  )
1163
1290
  tables = [table] + tables
1164
1291
  all_tables.extend(tables)
@@ -1225,72 +1352,6 @@ class Summarizer:
1225
1352
 
1226
1353
  parallel_map(process, self.runs, parallelism=self.num_threads)
1227
1354
 
1228
- def read_scenario_spec_instance_ids(self, num_instances) -> None:
1229
- """
1230
- This file checks if there exists a file, scenario_spec_instance_ids.json
1231
- that it can read the instance_ids associated with scenario_specs.
1232
-
1233
- It will write the num_instances used in the run as part of the file name
1234
-
1235
- If it doesn't exist, it will go through all the scenario_state files
1236
- and parse the instance_ids and output it to the file for future uses
1237
-
1238
- Only when the scenario_specs for the data overlap script change
1239
- (or num_instances are different), will this need to be rerun.
1240
-
1241
- In such cases, do not include the file as part of the data_overlap directory.
1242
- """
1243
- self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
1244
-
1245
- data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
1246
- if not os.path.isdir(data_overlap_dir):
1247
- hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
1248
- return
1249
-
1250
- scenario_spec_instance_ids_json = os.path.join(
1251
- data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
1252
- )
1253
- if not os.path.exists(scenario_spec_instance_ids_json):
1254
- hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
1255
- self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
1256
- else:
1257
- hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
1258
- scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
1259
-
1260
- for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
1261
- scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
1262
- scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
1263
- self.scenario_spec_instance_id_dict[
1264
- scenario_spec_instance_ids.scenario_spec
1265
- ] = scenario_spec_instance_ids.instance_ids
1266
-
1267
- def write_scenario_spec_instance_ids_json(self, file_path) -> None:
1268
- for run in self.runs:
1269
- run_spec = run.run_spec
1270
- scenario_spec = run_spec.scenario_spec
1271
- if scenario_spec in self.scenario_spec_instance_id_dict:
1272
- continue
1273
-
1274
- run_path = run.run_path
1275
- instances_file_path = os.path.join(run_path, "instances.json")
1276
- with open(instances_file_path, "r") as f:
1277
- raw_instances = json.load(f)
1278
-
1279
- # Optimization: Don't structure to dataclass, since we only need to read `id`
1280
- instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
1281
- self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
1282
-
1283
- all_scenario_spec_instance_ids = []
1284
- for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
1285
- scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
1286
- all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
1287
-
1288
- with open(file_path, "w") as f:
1289
- f.writelines(
1290
- f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
1291
- for scenario_spec_instance_ids in all_scenario_spec_instance_ids
1292
- )
1293
-
1294
1355
  def symlink_latest(self) -> None:
1295
1356
  # Create a symlink runs/latest -> runs/<name_of_suite>,
1296
1357
  # so runs/latest always points to the latest run suite.
@@ -1302,24 +1363,20 @@ class Summarizer:
1302
1363
  os.unlink(symlink_path)
1303
1364
  os.symlink(os.path.basename(self.run_release_path), symlink_path)
1304
1365
 
1305
- def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
1366
+ def run_pipeline(self, skip_completed: bool) -> None:
1306
1367
  """Run the entire summarization pipeline."""
1307
- self.write_schema()
1308
-
1309
1368
  self.read_runs()
1310
1369
  self.group_runs()
1311
- self.check_metrics_defined()
1312
1370
 
1313
- self.write_run_display_json(skip_completed)
1314
-
1315
- # Must happen after summarizer.write_run_display_json()
1316
- # because it uses instances.json files
1317
- self.read_scenario_spec_instance_ids(num_instances)
1371
+ ensure_directory_exists(self.run_release_path)
1318
1372
 
1319
- # Must happen after summarizer.read_scenario_spec_instance_ids()
1320
- # because it uses self.scenario_spec_instance_id_dict
1321
- self.read_overlap_stats()
1373
+ # Must happen after self.read_runs()
1374
+ # because it uses self.runs
1375
+ self.fix_up_schema()
1376
+ self.check_metrics_defined()
1377
+ self.write_schema()
1322
1378
 
1379
+ self.write_run_display_json(skip_completed)
1323
1380
  self.write_executive_summary()
1324
1381
  self.write_runs()
1325
1382
  self.write_run_specs()
@@ -1331,16 +1388,70 @@ class Summarizer:
1331
1388
 
1332
1389
 
1333
1390
  @htrack("summarize")
1391
+ def summarize(args):
1392
+ release: Optional[str] = None
1393
+ suites: Optional[str] = None
1394
+ suite: Optional[str] = None
1395
+ if args.suite and (args.release or args.suites):
1396
+ raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
1397
+ elif args.suite:
1398
+ # Comment this out while we have a trial period for the `release` method.
1399
+ # hlog(
1400
+ # "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
1401
+ # "where --release specifies the name of a release and --suites specifies several run suites "
1402
+ # "to be included in that release."
1403
+ # )
1404
+ suite = args.suite
1405
+ elif args.release or args.suites:
1406
+ if not args.release or not args.suites:
1407
+ raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
1408
+ release = args.release
1409
+ suites = args.suites
1410
+ else:
1411
+ raise ValueError("Exactly one of --release or --suite must be specified.")
1412
+
1413
+ schema_path: Optional[str]
1414
+ if args.auto_generate_schema:
1415
+ if args.schema_path:
1416
+ raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
1417
+ schema_path = None
1418
+ elif args.schema_path:
1419
+ schema_path = args.schema_path
1420
+ else:
1421
+ schema_path = get_default_schema_path()
1422
+
1423
+ register_builtin_configs_from_helm_package()
1424
+ register_configs_from_directory(args.local_path)
1425
+
1426
+ # Output JSON files summarizing the benchmark results which will be loaded in the web interface
1427
+ summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
1428
+ summarizer = summarizer_cls(
1429
+ release=release,
1430
+ suites=suites,
1431
+ suite=suite,
1432
+ schema_path=schema_path,
1433
+ output_path=args.output_path,
1434
+ verbose=args.debug,
1435
+ num_threads=args.num_threads,
1436
+ allow_unknown_models=args.allow_unknown_models,
1437
+ )
1438
+ summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
1439
+ hlog("Done.")
1440
+
1441
+
1334
1442
  def main():
1335
1443
  parser = argparse.ArgumentParser()
1336
1444
  parser.add_argument(
1337
- "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
1445
+ "-o",
1446
+ "--output-path",
1447
+ type=str,
1448
+ help="Where the benchmarking output lives",
1449
+ default="benchmark_output",
1338
1450
  )
1339
1451
  parser.add_argument(
1340
- "--schema-file",
1452
+ "--schema-path",
1341
1453
  type=str,
1342
- help="File name of the schema to read (e.g., schema_classic.yaml).",
1343
- default=SCHEMA_CLASSIC_YAML_FILENAME,
1454
+ help="Path to the schema file (e.g., schema_classic.yaml).",
1344
1455
  )
1345
1456
  parser.add_argument(
1346
1457
  "--suite",
@@ -1353,9 +1464,18 @@ def main():
1353
1464
  help="Experimental: Name of the release this summarization should go under.",
1354
1465
  )
1355
1466
  parser.add_argument(
1356
- "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
1467
+ "--suites",
1468
+ type=str,
1469
+ nargs="+",
1470
+ help="Experimental: List of suites to summarize for this this release.",
1471
+ )
1472
+ parser.add_argument(
1473
+ "-n",
1474
+ "--num-threads",
1475
+ type=int,
1476
+ help="Max number of threads used to summarize",
1477
+ default=8,
1357
1478
  )
1358
- parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
1359
1479
  parser.add_argument(
1360
1480
  "--debug",
1361
1481
  action="store_true",
@@ -1366,12 +1486,6 @@ def main():
1366
1486
  action="store_true",
1367
1487
  help="Skip write_run_display_json() for runs which already have all output display JSON files",
1368
1488
  )
1369
- parser.add_argument(
1370
- "-num-instances",
1371
- type=int,
1372
- help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
1373
- default=1000,
1374
- )
1375
1489
  parser.add_argument(
1376
1490
  "--local-path",
1377
1491
  type=str,
@@ -1384,45 +1498,26 @@ def main():
1384
1498
  help="Whether to allow unknown models in the metadata file",
1385
1499
  default=True,
1386
1500
  )
1387
- args = parser.parse_args()
1388
-
1389
- release: Optional[str] = None
1390
- suites: Optional[str] = None
1391
- suite: Optional[str] = None
1392
- if args.suite and (args.release or args.suites):
1393
- raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
1394
- elif args.suite:
1395
- # Comment this out while we have a trial period for the `release` method.
1396
- # hlog(
1397
- # "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
1398
- # "where --release specifies the name of a release and --suites specifies several run suites "
1399
- # "to be included in that release."
1400
- # )
1401
- suite = args.suite
1402
- elif args.release or args.suites:
1403
- if not args.release or not args.suites:
1404
- raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
1405
- release = args.release
1406
- suites = args.suites
1407
- else:
1408
- raise ValueError("Exactly one of --release or --suite must be specified.")
1409
-
1410
- register_builtin_configs_from_helm_package()
1411
- register_configs_from_directory(args.local_path)
1412
-
1413
- # Output JSON files summarizing the benchmark results which will be loaded in the web interface
1414
- summarizer = Summarizer(
1415
- release=release,
1416
- suites=suites,
1417
- suite=suite,
1418
- schema_file=args.schema_file,
1419
- output_path=args.output_path,
1420
- verbose=args.debug,
1421
- num_threads=args.num_threads,
1422
- allow_unknown_models=args.allow_unknown_models,
1501
+ parser.add_argument(
1502
+ "--summarizer-class-name",
1503
+ type=str,
1504
+ default=None,
1505
+ help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
1423
1506
  )
1424
- summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
1425
- hlog("Done.")
1507
+ parser.add_argument(
1508
+ "--log-config",
1509
+ type=str,
1510
+ default=None,
1511
+ help="PATH to a YAML file to customize logging",
1512
+ )
1513
+ parser.add_argument(
1514
+ "--auto-generate-schema",
1515
+ action="store_true",
1516
+ help="EXPERIMENTAL: Auto-generate schema",
1517
+ )
1518
+ args = parser.parse_args()
1519
+ setup_default_logging(args.log_config)
1520
+ summarize(args)
1426
1521
 
1427
1522
 
1428
1523
  if __name__ == "__main__":