crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the package registry's advisory page for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -9,12 +9,10 @@ Usage:
9
9
  """
10
10
 
11
11
  import argparse
12
- import cattrs
13
12
  import os
14
13
  import datetime
15
14
  import urllib.parse
16
15
  import json
17
- import yaml
18
16
  from collections import defaultdict
19
17
  from dataclasses import dataclass, replace
20
18
  from statistics import mean, median
@@ -35,8 +33,6 @@ from helm.common.codec import from_json
35
33
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
36
34
  from helm.benchmark.scenarios.scenario import ScenarioSpec
37
35
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
38
- from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
39
- from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
40
36
  from helm.benchmark.metrics.metric_name import MetricName
41
37
  from helm.benchmark.metrics.metric import get_all_stats_by_name
42
38
  from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -56,9 +52,10 @@ from helm.benchmark.presentation.schema import (
56
52
  from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
57
53
  from helm.benchmark.presentation.run_display import write_run_display_json
58
54
  from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
55
+ from helm.common.object_spec import get_class_by_name
59
56
 
60
57
 
61
- OVERLAP_N_COUNT = 13
58
+ MODEL_HEADER_CELL_VALUE = "Model"
62
59
 
63
60
 
64
61
  @dataclass(frozen=True)
@@ -226,17 +223,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
226
223
  """
227
224
  assert aggregation in ["mean", "median"]
228
225
  win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
229
- for i, header_cell in enumerate(table.header):
226
+ for column_index, header_cell in enumerate(table.header):
230
227
  lower_is_better = header_cell.lower_is_better
231
228
  if lower_is_better is None: # column does not have a meaningful ordering
232
229
  continue
233
-
234
- values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
235
- if len(values) < 2: # don't rank a single model
230
+ value_to_count: Dict[float, int] = defaultdict(int)
231
+ for row in table.rows:
232
+ value = row[column_index].value
233
+ if value is not None:
234
+ value_to_count[value] += 1
235
+ value_to_wins: Dict[float, float] = {}
236
+ acc_count = 0
237
+ for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
238
+ value_to_wins[value] = acc_count + ((value_count - 1) / 2)
239
+ acc_count += value_count
240
+ total_count = acc_count
241
+ if total_count < 2:
236
242
  continue
237
- for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
238
- win_rate = wins / (len(values) - 1) # normalize to [0, 1]
239
- win_rates_per_row[j].append(win_rate)
243
+ for row_index, row in enumerate(table.rows):
244
+ value = row[column_index].value
245
+ if value is not None:
246
+ win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))
240
247
 
241
248
  # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
242
249
  # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
@@ -251,7 +258,48 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
251
258
  return aggregate_win_rates
252
259
 
253
260
 
254
- AGGREGATE_WIN_RATE_COLUMN = 1
261
+ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
262
+ """
263
+ Computes the aggregate mean of each row across columns.
264
+ Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
265
+ non-null values of the row are in columns we skip).
266
+ """
267
+
268
+ row_means: List[Optional[float]] = []
269
+ # if the first column contains the names of models, do not treat it like a value column
270
+ skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
271
+
272
+ # check for all header cells where specified, that lower_is_better is consistent
273
+ orderings = []
274
+ header_cells = table.header[1:] if skip_first_column else table.header
275
+ for header_cell in header_cells:
276
+ orderings.append(header_cell.lower_is_better)
277
+ if len(set(orderings)) != 1:
278
+ raise Exception("Cannot mean columns with different values for lower_is_better")
279
+
280
+ for row in table.rows:
281
+ total = 0.0
282
+ count = 0
283
+ row_cells = row[1:] if skip_first_column else row
284
+ for cell in row_cells:
285
+ if cell.value is not None:
286
+ total += float(cell.value)
287
+ count += 1
288
+ if count == 0:
289
+ row_means.append(None)
290
+ else:
291
+ row_means.append(total / count)
292
+
293
+ return row_means
294
+
295
+
296
+ class AggregationStrategy:
297
+ # TODO: Convert to StrEnum after upgrading to Python 3.11
298
+ WIN_RATE = "win_rate"
299
+ MEAN = "mean"
300
+
301
+
302
+ ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]
255
303
 
256
304
 
257
305
  class Summarizer:
@@ -483,137 +531,6 @@ class Summarizer:
483
531
  for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
484
532
  self.read_runs_for_suite(suite, run_suite_path)
485
533
 
486
- def read_overlap_stats(self):
487
- """
488
- Load the overlap stats in the run suite path.
489
- Concretely:
490
- - get group -> scenario_spec information from self.runs
491
- run_spec data
492
- - read the files in the data_overlap directory in run_suite_path
493
- which are scenario_spec -> overlap ids
494
- - get aggregate stats for group -> overlap ratio
495
- """
496
-
497
- def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
498
- scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
499
- for run_spec in run_specs:
500
- scenario_spec = run_spec.scenario_spec
501
- groups = run_spec.groups
502
- if (
503
- scenario_spec.class_name
504
- != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
505
- ):
506
- scenario_specs_to_groups[scenario_spec] = groups
507
-
508
- group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
509
- for scenario_spec, groups in scenario_specs_to_groups.items():
510
- for group in groups:
511
- if group not in group_to_scenario_specs:
512
- group_to_scenario_specs[group] = []
513
- group_to_scenario_specs[group].append(scenario_spec)
514
- return group_to_scenario_specs
515
-
516
- def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
517
- """
518
- Takes the data_overlap_dir as input and returns a dictionary
519
- of stats_file_path -> List(model_names)
520
-
521
- Sample input:
522
- file_models_mapping:
523
- - file_name: file1
524
- model_names:
525
- - model1
526
- - model2
527
- - file_name: file2
528
- model_names:
529
- - model2
530
- - model3
531
-
532
- """
533
- metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
534
- if not os.path.exists(metadata_file_path):
535
- return {}
536
-
537
- with open(metadata_file_path, "r") as yaml_file:
538
- data = yaml.safe_load(yaml_file)
539
-
540
- file_metadata: Dict[str, List[str]] = {}
541
- for entry in data["file_models_mapping"]:
542
- if "file_name" in entry and "model_names" in entry:
543
- file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
544
- file_metadata[file_path] = entry["model_names"]
545
-
546
- return file_metadata
547
-
548
- # TODO: Delete this after @andyzorigin's project is done.
549
- self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
550
-
551
- data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
552
- if not os.path.isdir(data_overlap_dir):
553
- hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
554
- return
555
-
556
- group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
557
-
558
- stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
559
-
560
- for file_path, model_names in stats_file_metadata.items():
561
- overlap_stats_jsons = open(file_path, "r").readlines()
562
-
563
- data_overlap_stats_list: List[DataOverlapStats] = []
564
- for overlap_stats_json in overlap_stats_jsons:
565
- overlap_stats_dict = json.loads(overlap_stats_json)
566
- data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
567
-
568
- scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
569
- for data_overlap_stats in data_overlap_stats_list:
570
- data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
571
- n = data_overlap_stats_key.overlap_protocol_spec.n
572
- if n == OVERLAP_N_COUNT:
573
- light_scenario_key = data_overlap_stats_key.light_scenario_key
574
- scenario_spec = light_scenario_key.scenario_spec
575
- if scenario_spec in self.scenario_spec_instance_id_dict:
576
- # Get statistics based on the subset of instance_ids that HELM uses for a scenario
577
- instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
578
- num_instances = len(instance_ids)
579
- num_overlapping_inputs = len(
580
- set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
581
- )
582
- num_overlapping_references = len(
583
- set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
584
- )
585
- scenario_spec_overlap_counts[scenario_spec] = (
586
- num_instances,
587
- num_overlapping_inputs,
588
- num_overlapping_references,
589
- )
590
-
591
- for group, scenario_specs in group_to_scenario_specs.items():
592
- group_num_instances = 0
593
- group_num_overlapping_inputs = 0
594
- group_num_overlapping_references = 0
595
- for scenario_spec in scenario_specs:
596
- if scenario_spec in scenario_spec_overlap_counts:
597
- (
598
- num_instances,
599
- num_overlapping_inputs,
600
- num_overlapping_references,
601
- ) = scenario_spec_overlap_counts[scenario_spec]
602
- group_num_instances += num_instances
603
- group_num_overlapping_inputs += num_overlapping_inputs
604
- group_num_overlapping_references += num_overlapping_references
605
- if group_num_instances != 0:
606
- group_overlap_stats = GroupOverlapStats(
607
- group=group,
608
- num_instances=group_num_instances,
609
- num_overlapping_inputs=group_num_overlapping_inputs,
610
- num_overlapping_references=group_num_overlapping_references,
611
- )
612
- for model_name in model_names:
613
- # Assume model name will only be associated with single group overlap list for now
614
- # can update to join lists if need arises
615
- self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats
616
-
617
534
  @htrack(None)
618
535
  def check_metrics_defined(self):
619
536
  """Check that all the metrics that appear in stats are defined."""
@@ -880,7 +797,7 @@ class Summarizer:
880
797
  sort_by_model_order: bool = True,
881
798
  sub_split: Optional[str] = None,
882
799
  bold_columns: bool = True,
883
- add_win_rate: bool = False,
800
+ aggregation_strategies: List[str] = [],
884
801
  ) -> Table:
885
802
  """
886
803
  Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -902,7 +819,7 @@ class Summarizer:
902
819
  num_groups = len(set(run_group.name for run_group, _ in columns)) # number of unique groups, determines headers
903
820
 
904
821
  # Column headers
905
- header.append(HeaderCell("Model/adapter"))
822
+ header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
906
823
  for run_group, metric_group_name in columns:
907
824
  # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
908
825
  if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -1016,16 +933,6 @@ class Summarizer:
1016
933
 
1017
934
  description = ""
1018
935
 
1019
- group_overlap_stats = None
1020
- if (model_name, group_name) in self._model_group_overlap_stats:
1021
- group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
1022
-
1023
- description = (
1024
- f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
1025
- f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
1026
- f"{description}"
1027
- )
1028
-
1029
936
  # HACK: we want to hide stats for the following model-metric combinations:
1030
937
  # 1. Calibration metrics + AI21/Anthropic
1031
938
  # 2. MSMARCO metrics + AI21/Anthropic
@@ -1063,21 +970,42 @@ class Summarizer:
1063
970
 
1064
971
  table = Table(title=title, header=header, rows=rows, links=links, name=name)
1065
972
 
1066
- if add_win_rate:
1067
- # add overall win rate as the second column
1068
- WIN_RATE_AGGREGATION = "mean"
1069
- win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
1070
- description = "How many models this model outperform on average (over columns)."
1071
- table.header.insert(
1072
- AGGREGATE_WIN_RATE_COLUMN,
1073
- HeaderCell(
1074
- f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
1075
- description=description,
1076
- lower_is_better=False,
1077
- ),
1078
- )
1079
- for row, win_rate in zip(table.rows, win_rates):
1080
- row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
973
+ aggregate_header_cells: List[HeaderCell] = []
974
+ aggregate_row_values: List[List[Optional[float]]] = []
975
+
976
+ for strategy in aggregation_strategies:
977
+ if strategy == AggregationStrategy.WIN_RATE:
978
+ WIN_RATE_AGGREGATION = "mean"
979
+ win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
980
+ aggregate_header_cells.append(
981
+ HeaderCell(
982
+ f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
983
+ description="How many models this model outperforms on average (over columns).",
984
+ lower_is_better=False,
985
+ )
986
+ )
987
+ aggregate_row_values.append(win_rates)
988
+ elif strategy == AggregationStrategy.MEAN:
989
+ means = compute_aggregate_row_means(table)
990
+ aggregate_header_cells.append(
991
+ HeaderCell(
992
+ "Mean score",
993
+ description="The mean of the scores from all columns.",
994
+ lower_is_better=table.header[0].lower_is_better,
995
+ )
996
+ )
997
+ aggregate_row_values.append(means)
998
+ else:
999
+ raise Exception(
1000
+ f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
1001
+ )
1002
+
1003
+ for i in range(len(aggregate_header_cells)):
1004
+ aggregate_header_cell = aggregate_header_cells[i]
1005
+ aggregate_rows = aggregate_row_values[i]
1006
+ table.header.insert(i + 1, aggregate_header_cell)
1007
+ for row, row_val in zip(table.rows, aggregate_rows):
1008
+ row.insert(i + 1, Cell(row_val))
1081
1009
 
1082
1010
  if bold_columns:
1083
1011
  for i, header_cell in enumerate(table.header):
@@ -1125,14 +1053,22 @@ class Summarizer:
1125
1053
 
1126
1054
  if len(adapter_to_runs) > 0:
1127
1055
  for metric_group in all_metric_groups:
1128
- display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
1056
+ metric_group_config = self.schema.name_to_metric_group[metric_group]
1057
+ display_name = metric_group_config.get_short_display_name()
1058
+ aggregate_strategies: List[str]
1059
+ if metric_group_config.aggregation_strategies is not None:
1060
+ aggregate_strategies = metric_group_config.aggregation_strategies
1061
+ elif metric_group_config.hide_win_rates:
1062
+ aggregate_strategies = []
1063
+ else:
1064
+ aggregate_strategies = [AggregationStrategy.WIN_RATE]
1129
1065
  table = self.create_group_table(
1130
1066
  name=metric_group,
1131
1067
  title=display_name,
1132
1068
  adapter_to_runs=adapter_to_runs,
1133
1069
  columns=[(subgroup, metric_group) for subgroup in subgroups],
1134
1070
  is_scenario_table=False,
1135
- add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
1071
+ aggregation_strategies=aggregate_strategies,
1136
1072
  )
1137
1073
  tables.append(table)
1138
1074
  return tables
@@ -1262,72 +1198,6 @@ class Summarizer:
1262
1198
 
1263
1199
  parallel_map(process, self.runs, parallelism=self.num_threads)
1264
1200
 
1265
- def read_scenario_spec_instance_ids(self, num_instances) -> None:
1266
- """
1267
- This file checks if there exists a file, scenario_spec_instance_ids.json
1268
- that it can read the instance_ids associated with scenario_specs.
1269
-
1270
- It will write the num_instances used in the run as part of the file name
1271
-
1272
- If it doesn't exist, it will go through all the scenario_state files
1273
- and parse the instance_ids and output it to the file for future uses
1274
-
1275
- Only when the scenario_specs for the data overlap script change
1276
- (or num_instances are different), will this need to be rerun.
1277
-
1278
- In such cases, do not include the file as part of the data_overlap directory.
1279
- """
1280
- self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
1281
-
1282
- data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
1283
- if not os.path.isdir(data_overlap_dir):
1284
- hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
1285
- return
1286
-
1287
- scenario_spec_instance_ids_json = os.path.join(
1288
- data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
1289
- )
1290
- if not os.path.exists(scenario_spec_instance_ids_json):
1291
- hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
1292
- self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
1293
- else:
1294
- hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
1295
- scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
1296
-
1297
- for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
1298
- scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
1299
- scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
1300
- self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
1301
- scenario_spec_instance_ids.instance_ids
1302
- )
1303
-
1304
- def write_scenario_spec_instance_ids_json(self, file_path) -> None:
1305
- for run in self.runs:
1306
- run_spec = run.run_spec
1307
- scenario_spec = run_spec.scenario_spec
1308
- if scenario_spec in self.scenario_spec_instance_id_dict:
1309
- continue
1310
-
1311
- run_path = run.run_path
1312
- instances_file_path = os.path.join(run_path, "instances.json")
1313
- with open(instances_file_path, "r") as f:
1314
- raw_instances = json.load(f)
1315
-
1316
- # Optimization: Don't structure to dataclass, since we only need to read `id`
1317
- instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
1318
- self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
1319
-
1320
- all_scenario_spec_instance_ids = []
1321
- for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
1322
- scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
1323
- all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
1324
-
1325
- with open(file_path, "w") as f:
1326
- f.writelines(
1327
- f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
1328
- for scenario_spec_instance_ids in all_scenario_spec_instance_ids
1329
- )
1330
-
1331
1201
  def symlink_latest(self) -> None:
1332
1202
  # Create a symlink runs/latest -> runs/<name_of_suite>,
1333
1203
  # so runs/latest always points to the latest run suite.
@@ -1339,7 +1209,7 @@ class Summarizer:
1339
1209
  os.unlink(symlink_path)
1340
1210
  os.symlink(os.path.basename(self.run_release_path), symlink_path)
1341
1211
 
1342
- def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
1212
+ def run_pipeline(self, skip_completed: bool) -> None:
1343
1213
  """Run the entire summarization pipeline."""
1344
1214
  self.read_runs()
1345
1215
  self.group_runs()
@@ -1347,14 +1217,6 @@ class Summarizer:
1347
1217
 
1348
1218
  self.write_run_display_json(skip_completed)
1349
1219
 
1350
- # Must happen after summarizer.write_run_display_json()
1351
- # because it uses instances.json files
1352
- self.read_scenario_spec_instance_ids(num_instances)
1353
-
1354
- # Must happen after summarizer.read_scenario_spec_instance_ids()
1355
- # because it uses self.scenario_spec_instance_id_dict
1356
- self.read_overlap_stats()
1357
-
1358
1220
  # Must happen after self.read_runs()
1359
1221
  # because it uses self.runs
1360
1222
  self.write_schema()
@@ -1404,12 +1266,6 @@ def main():
1404
1266
  action="store_true",
1405
1267
  help="Skip write_run_display_json() for runs which already have all output display JSON files",
1406
1268
  )
1407
- parser.add_argument(
1408
- "-num-instances",
1409
- type=int,
1410
- help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
1411
- default=1000,
1412
- )
1413
1269
  parser.add_argument(
1414
1270
  "--local-path",
1415
1271
  type=str,
@@ -1422,6 +1278,12 @@ def main():
1422
1278
  help="Whether to allow unknown models in the metadata file",
1423
1279
  default=True,
1424
1280
  )
1281
+ parser.add_argument(
1282
+ "--summarizer-class-name",
1283
+ type=str,
1284
+ default=None,
1285
+ help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
1286
+ )
1425
1287
  args = parser.parse_args()
1426
1288
 
1427
1289
  release: Optional[str] = None
@@ -1451,7 +1313,8 @@ def main():
1451
1313
  register_configs_from_directory(args.local_path)
1452
1314
 
1453
1315
  # Output JSON files summarizing the benchmark results which will be loaded in the web interface
1454
- summarizer = Summarizer(
1316
+ summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
1317
+ summarizer = summarizer_cls(
1455
1318
  release=release,
1456
1319
  suites=suites,
1457
1320
  suite=suite,
@@ -1461,7 +1324,7 @@ def main():
1461
1324
  num_threads=args.num_threads,
1462
1325
  allow_unknown_models=args.allow_unknown_models,
1463
1326
  )
1464
- summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
1327
+ summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
1465
1328
  hlog("Done.")
1466
1329
 
1467
1330
 
@@ -1,8 +1,9 @@
1
1
  import os
2
2
  import tempfile
3
3
 
4
- from helm.benchmark.presentation.summarize import Summarizer
4
+ from helm.benchmark.presentation.summarize import Summarizer, compute_aggregate_row_win_rates
5
5
  from helm.benchmark.presentation.schema import get_default_schema_path
6
+ from helm.benchmark.presentation.table import Cell, HeaderCell, Table
6
7
  from helm.common.general import ensure_directory_exists
7
8
 
8
9
 
@@ -19,7 +20,7 @@ def test_summarize_suite():
19
20
  num_threads=4,
20
21
  allow_unknown_models=True,
21
22
  )
22
- summarizer.run_pipeline(skip_completed=True, num_instances=1000)
23
+ summarizer.run_pipeline(skip_completed=True)
23
24
  assert os.path.isfile(os.path.join(output_path, "runs", "test_suite", "groups.json"))
24
25
 
25
26
 
@@ -37,5 +38,146 @@ def test_summarize_release():
37
38
  num_threads=4,
38
39
  allow_unknown_models=True,
39
40
  )
40
- summarizer.run_pipeline(skip_completed=True, num_instances=1000)
41
+ summarizer.run_pipeline(skip_completed=True)
41
42
  assert os.path.isfile(os.path.join(output_path, "releases", "test_release", "groups.json"))
43
+
44
+
45
+ def test_compute_win_rates_one_scenario():
46
+ header = [
47
+ HeaderCell(value="Model"),
48
+ HeaderCell(value="Scenario A", lower_is_better=False),
49
+ ]
50
+ values = [
51
+ ["Model A", 1],
52
+ ["Model B", 2],
53
+ ["Model C", 3],
54
+ ["Model D", 4],
55
+ ["Model E", 5],
56
+ ]
57
+ rows = [[Cell(value) for value in row_values] for row_values in values]
58
+ table = Table(title="Test Table", header=header, rows=rows)
59
+ assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
60
+
61
+
62
+ def test_compute_win_rates_two_scenarios():
63
+ header = [
64
+ HeaderCell(value="Model"),
65
+ HeaderCell(value="Scenario A", lower_is_better=False),
66
+ HeaderCell(value="Scenario B", lower_is_better=False),
67
+ ]
68
+ values = [
69
+ ["Model A", 1, 3],
70
+ ["Model B", 2, 1],
71
+ ["Model C", 3, 2],
72
+ ["Model D", 4, 5],
73
+ ["Model E", 5, 4],
74
+ ]
75
+ rows = [[Cell(value) for value in row_values] for row_values in values]
76
+ table = Table(title="Test Table", header=header, rows=rows)
77
+ assert compute_aggregate_row_win_rates(table) == [0.25, 0.125, 0.375, 0.875, 0.875]
78
+
79
+
80
+ def test_compute_win_rates_incomplete_values():
81
+ header = [
82
+ HeaderCell(value="Model"),
83
+ HeaderCell(value="Scenario A", lower_is_better=False),
84
+ HeaderCell(value="Scenario B", lower_is_better=False),
85
+ ]
86
+ values = [
87
+ ["Model A", 1, 3],
88
+ ["Model B", 2, 1],
89
+ ["Model C", 3, None],
90
+ ["Model D", 4, None],
91
+ ["Model E", 5, None],
92
+ ]
93
+ rows = [[Cell(value) for value in row_values] for row_values in values]
94
+ table = Table(title="Test Table", header=header, rows=rows)
95
+ assert compute_aggregate_row_win_rates(table) == [0.5, 0.125, 0.5, 0.75, 1]
96
+
97
+
98
+ def test_compute_win_rates_ignore_nones():
99
+ header = [
100
+ HeaderCell(value="Model"),
101
+ HeaderCell(value="Scenario A", lower_is_better=False),
102
+ HeaderCell(value="Scenario B", lower_is_better=False),
103
+ HeaderCell(value="Scenario C", lower_is_better=False),
104
+ ]
105
+ values = [
106
+ ["Model A", 1, None, None],
107
+ ["Model B", 2, None, 1],
108
+ ["Model C", 3, None, None],
109
+ ["Model D", 4, None, None],
110
+ ["Model E", 5, None, None],
111
+ ]
112
+ rows = [[Cell(value) for value in row_values] for row_values in values]
113
+ table = Table(title="Test Table", header=header, rows=rows)
114
+ assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
115
+
116
+
117
+ def test_compute_win_rates_ignore_unset_lower_is_better():
118
+ header = [
119
+ HeaderCell(value="Model"),
120
+ HeaderCell(value="Scenario A", lower_is_better=False),
121
+ HeaderCell(value="Scenario B"),
122
+ ]
123
+ values = [
124
+ ["Model A", 1, 3],
125
+ ["Model B", 2, 1],
126
+ ["Model C", 3, 2],
127
+ ["Model D", 4, 5],
128
+ ["Model E", 5, 4],
129
+ ]
130
+ rows = [[Cell(value) for value in row_values] for row_values in values]
131
+ table = Table(title="Test Table", header=header, rows=rows)
132
+ assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
133
+
134
+
135
+ def test_compute_win_rates_no_win_rate():
136
+ header = [
137
+ HeaderCell(value="Model"),
138
+ HeaderCell(value="Scenario A", lower_is_better=False),
139
+ ]
140
+ values = [
141
+ ["Model A", None],
142
+ ["Model B", None],
143
+ ["Model C", None],
144
+ ["Model D", None],
145
+ ["Model E", None],
146
+ ]
147
+ rows = [[Cell(value) for value in row_values] for row_values in values]
148
+ table = Table(title="Test Table", header=header, rows=rows)
149
+ assert compute_aggregate_row_win_rates(table) == [None, None, None, None, None]
150
+
151
+
152
+ def test_compute_win_rates_ties():
153
+ header = [
154
+ HeaderCell(value="Model"),
155
+ HeaderCell(value="Scenario A", lower_is_better=False),
156
+ ]
157
+ values = [
158
+ ["Model A", 1],
159
+ ["Model B", 1],
160
+ ["Model C", 1],
161
+ ["Model D", 4],
162
+ ["Model E", 5],
163
+ ]
164
+ rows = [[Cell(value) for value in row_values] for row_values in values]
165
+ table = Table(title="Test Table", header=header, rows=rows)
166
+ assert compute_aggregate_row_win_rates(table) == [0.25, 0.25, 0.25, 0.75, 1.0]
167
+
168
+
169
+ def test_compute_win_rates_lower_is_better():
170
+ header = [
171
+ HeaderCell(value="Model"),
172
+ HeaderCell(value="Scenario A", lower_is_better=True),
173
+ ]
174
+ values = [
175
+ ["Model A", 1],
176
+ ["Model B", 2],
177
+ ["Model C", 3],
178
+ ["Model D", 4],
179
+ ["Model E", 5],
180
+ ]
181
+ rows = [[Cell(value) for value in row_values] for row_values in values]
182
+ table = Table(title="Test Table", header=header, rows=rows)
183
+ assert compute_aggregate_row_win_rates(table) == [1, 0.75, 0.5, 0.25, 0]