crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this version of crfm-helm has been flagged as a potentially problematic release.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0

--- a/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py
+++ b/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py
@@ -2,12 +2,12 @@ from typing import Dict
 
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .ai21_token_cost_estimator import AI21TokenCostEstimator
-from .cohere_token_cost_estimator import CohereTokenCostEstimator
-from .free_token_cost_estimator import FreeTokenCostEstimator
-from .gooseai_token_cost_estimator import GooseAITokenCostEstimator
-from .openai_token_cost_estimator import OpenAITokenCostEstimator
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
+from helm.benchmark.metrics.tokens.cohere_token_cost_estimator import CohereTokenCostEstimator
+from helm.benchmark.metrics.tokens.free_token_cost_estimator import FreeTokenCostEstimator
+from helm.benchmark.metrics.tokens.gooseai_token_cost_estimator import GooseAITokenCostEstimator
+from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class AutoTokenCostEstimator(TokenCostEstimator):

--- a/helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py
+++ b/helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py
@@ -1,6 +1,6 @@
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class CohereTokenCostEstimator(TokenCostEstimator):

--- a/helm/benchmark/metrics/tokens/free_token_cost_estimator.py
+++ b/helm/benchmark/metrics/tokens/free_token_cost_estimator.py
@@ -1,6 +1,6 @@
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class FreeTokenCostEstimator(TokenCostEstimator):

--- a/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py
+++ b/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class GooseAITokenCostEstimator(TokenCostEstimator):

--- a/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py
+++ b/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.window_service import WindowService
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class OpenAITokenCostEstimator(TokenCostEstimator):

--- a/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
+++ b/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
@@ -1,7 +1,7 @@
 from unittest.mock import Mock
 
 from helm.common.request import Request
-from .ai21_token_cost_estimator import AI21TokenCostEstimator
+from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
 
 
 class TestAI21TokenCostEstimator:

--- a/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
+++ b/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
 from helm.common.request import Request
 from helm.common.tokenization_request import TokenizationRequestResult, TokenizationToken
 from helm.proxy.services.remote_service import RemoteService
-from .openai_token_cost_estimator import OpenAITokenCostEstimator
+from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
 
 
 class TestOpenAITokenCostEstimator:

--- a/helm/benchmark/metrics/toxicity_metrics.py
+++ b/helm/benchmark/metrics/toxicity_metrics.py
@@ -6,10 +6,10 @@ from helm.common.hierarchical_logger import hlog
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 
 
 class ToxicityMetric(Metric):

--- a/helm/benchmark/metrics/unitxt_metrics.py
+++ b/helm/benchmark/metrics/unitxt_metrics.py
@@ -18,7 +18,10 @@ class UnitxtMetric(MetricInterface):
 
     def __init__(self, **kwargs):
         super().__init__()
-        dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
+        if len(kwargs) == 1 and "recipe" in kwargs:
+            dataset_name = kwargs["recipe"]
+        else:
+            dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
         self.dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
 
     def evaluate(
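
The new branch lets a caller pass a complete unitxt recipe string through unchanged, instead of having it rebuilt from keyword arguments. A minimal standalone sketch of the name construction (the recipe string is illustrative, not from the release):

    def build_dataset_name(**kwargs):
        # A lone "recipe" kwarg is passed through verbatim; anything else is
        # joined back into "key=value" pairs, as before this change.
        if len(kwargs) == 1 and "recipe" in kwargs:
            return kwargs["recipe"]
        return ",".join(f"{key}={value}" for key, value in kwargs.items())

    print(build_dataset_name(recipe="card=cards.mmlu,template_card_index=0"))
    # card=cards.mmlu,template_card_index=0  (passed through unchanged)
    print(build_dataset_name(card="cards.mmlu", template_card_index=0))
    # card=cards.mmlu,template_card_index=0  (rebuilt from kwargs)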

--- a/helm/benchmark/metrics/vision_language/image_metrics.py
+++ b/helm/benchmark/metrics/vision_language/image_metrics.py
@@ -324,7 +324,7 @@ class AnnotatedImageMetrics(Metric):
 
         # Compute the LPIPS score
         assert self._lpips_metric is not None
-        score: float = self._lpips_metric(img1, img2).detach().item()
+        score: float = 1.0 - self._lpips_metric(img1, img2).detach().item()
         return score
 
     def _calculate_fid(self, act1, act2):
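
LPIPS is a perceptual distance: 0.0 for identical images, larger as they diverge. Subtracting it from 1.0 converts the distance into a similarity score where higher is better, matching the direction of the other image scores. With illustrative numbers:

    lpips_distance = 0.12          # small distance: the images are perceptually close
    score = 1.0 - lpips_distance   # 0.88: after this change, higher means more similar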

--- /dev/null
+++ b/helm/benchmark/metrics/wildbench_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class WildBenchScoreMetric(Metric):
+    """Score metrics for WildBench."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["wildbench"]
+        scores: List[float] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_score") and annotation_value is not None:
+                scores.append(annotation_value)
+        if not scores:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(scores) / len(scores)
+        score_rescaled = (score - 1) / 9
+        return [
+            Stat(MetricName("wildbench_score")).add(score),
+            Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
+        ]
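
The rescaling maps the judges' 1-10 scale onto [0, 1]: a mean score of 1 becomes 0.0 and a mean score of 10 becomes 1.0. A worked example with hypothetical annotator output:

    annotations = {"gpt4_score": 7.0, "claude_score": 8.0}  # hypothetical judge scores
    scores = [v for k, v in annotations.items() if k.endswith("_score") and v is not None]
    score = sum(scores) / len(scores)  # 7.5 on the 1-10 scale
    score_rescaled = (score - 1) / 9   # (7.5 - 1) / 9 = 0.722...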

--- a/helm/benchmark/model_metadata_registry.py
+++ b/helm/benchmark/model_metadata_registry.py
@@ -22,6 +22,9 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
+# For NOVA models
+NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
+
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -66,10 +69,18 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# For audio-language models (AudioLMs)
+AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
+
 # Deprecated models that are no longer available.
 # These are usually closed API models that have been permanently removed
 DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
+# Unsupported models.
+# These are models that we have chosen not to support because they are
+# private, stale, non-notable, or difficult to implement.
+UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
+
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 
@@ -208,6 +219,11 @@ def is_vlm(model_name: str) -> bool:
     return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
 
 
+def is_audiolm(model_name: str) -> bool:
+    """Returns True if the model is an audio-language model (AudioLM). False otherwise."""
+    return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
+
+
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(
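
The new is_audiolm predicate mirrors the existing is_vlm helper: it only consults the tags attached to the model's registered metadata. A usage sketch (the model name is hypothetical):

    from helm.benchmark.model_metadata_registry import is_audiolm

    # True only if the model's metadata carries AUDIO_LANGUAGE_MODEL_TAG.
    if is_audiolm("qwen/qwen2-audio-instruct"):  # hypothetical model name
        print("route the request through an audio-capable client")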

--- a/helm/benchmark/presentation/summarize.py
+++ b/helm/benchmark/presentation/summarize.py
@@ -52,6 +52,10 @@ from helm.benchmark.presentation.schema import (
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
+
+
+MODEL_HEADER_CELL_VALUE = "Model"
 
 
 @dataclass(frozen=True)
@@ -262,18 +266,22 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     """
 
     row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
 
     # check for all header cells where specified, that lower_is_better is consistent
     orderings = []
-    for elem in table.header:
-        orderings.append(elem.lower_is_better)
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
    if len(set(orderings)) != 1:
        raise Exception("Cannot mean columns with different values for lower_is_better")
 
    for row in table.rows:
        total = 0.0
        count = 0
-        for cell in row:
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
            if cell.value is not None:
                total += float(cell.value)
                count += 1
@@ -811,7 +819,7 @@ class Summarizer:
         num_groups = len(set(run_group.name for run_group, _ in columns))  # number of unique groups, determines headers
 
         # Column headers
-        header.append(HeaderCell("Model/adapter"))
+        header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
         for run_group, metric_group_name in columns:
             # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
             if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -969,22 +977,20 @@ class Summarizer:
         if strategy == AggregationStrategy.WIN_RATE:
             WIN_RATE_AGGREGATION = "mean"
             win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-            description = "How many models this model outperforms on average (over columns)."
             aggregate_header_cells.append(
                 HeaderCell(
                     f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                    description=description,
+                    description="How many models this model outperforms on average (over columns).",
                     lower_is_better=False,
                 )
             )
             aggregate_row_values.append(win_rates)
         elif strategy == AggregationStrategy.MEAN:
             means = compute_aggregate_row_means(table)
-            description = "An average over columns representing the mean performance."
             aggregate_header_cells.append(
                 HeaderCell(
-                    "Mean performance",
-                    description=description,
+                    "Mean score",
+                    description="The mean of the scores from all columns.",
                     lower_is_better=table.header[0].lower_is_better,
                 )
             )
@@ -1272,6 +1278,12 @@ def main():
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
+    )
     args = parser.parse_args()
 
     release: Optional[str] = None
@@ -1301,7 +1313,8 @@ def main():
         register_configs_from_directory(args.local_path)
 
     # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
         release=release,
         suites=suites,
         suite=suite,
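
The skip_first_column guard keeps the leading "Model" column out of the row means, so model names are never coerced to floats and the lower_is_better consistency check only sees value columns. A standalone sketch of the skipping logic (table contents are hypothetical):

    header = ["Model", "MMLU", "GSM8K"]  # hypothetical leaderboard table
    row = ["model-a", 0.85, 0.90]
    skip_first_column = bool(header) and header[0] == "Model"
    cells = row[1:] if skip_first_column else row
    values = [float(c) for c in cells if c is not None]
    print(sum(values) / len(values))  # 0.875; "model-a" never reaches float()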
@@ -0,0 +1,178 @@
1
+ import os
2
+ from typing import Dict, List, Optional
3
+
4
+ from helm.benchmark.metrics.metric import PerInstanceStats
5
+ from helm.benchmark.presentation.schema import MetricNameMatcher, RunGroup
6
+ from helm.benchmark.presentation.summarize import Run, Summarizer
7
+ from helm.benchmark.presentation.table import Cell
8
+ from helm.common.codec import from_json
9
+
10
+
11
+ class ToRRRobustnessSummarizer(Summarizer):
12
+ """A Summarizer that computes robustness metrics.
13
+
14
+ This Summarizer computes a robustness metrics based on the definition in the ToRR paper.
15
+ The instance-level robustness score for a given model and instance is defined as
16
+ 1 - (max_score - min_score) where max_score and min_scores are the maximum and minimum
17
+ scores for the model on that instance across all runs (i.e. across all augmentations
18
+ and serialization formats). The robustness score for a given model and scenario is
19
+ the mean of the model's instance-level robustness score across all instances in that scenario.
20
+
21
+ The core HELM framework does not natively support computing metrics that depend on
22
+ per-instance metrics across multiple runs, therefore this special summarizer is needed
23
+ to compute this robustness metic."""
24
+
25
+ def __init__(
26
+ self,
27
+ release: Optional[str],
28
+ suites: Optional[List[str]],
29
+ suite: Optional[str],
30
+ schema_path: str,
31
+ output_path: str,
32
+ verbose: bool,
33
+ num_threads: int,
34
+ allow_unknown_models: bool,
35
+ ):
36
+ super().__init__(
37
+ release,
38
+ suites,
39
+ suite,
40
+ schema_path,
41
+ output_path,
42
+ verbose,
43
+ num_threads,
44
+ allow_unknown_models,
45
+ )
46
+ self.run_group_to_model_name_to_robustness: Dict[str, Dict[str, float]] = {}
47
+
48
+ PERFORMANCE_METRIC_GROUP_NAME = "performance_metrics"
49
+ ROBUSTNESS_METRIC_GROUP_NAME = "robustness_metrics"
50
+ ROBUSTNESS_METRIC_NAME = "robustness"
51
+
52
+ def _get_instance_id_to_performance(
53
+ self, run: Run, performance_metric_matcher: MetricNameMatcher
54
+ ) -> Dict[str, float]:
55
+ with open(os.path.join(run.run_path, "per_instance_stats.json")) as f:
56
+ per_instance_stats = from_json(f.read(), List[PerInstanceStats])
57
+ instance_id_to_performance: Dict[str, float] = {}
58
+ for per_instance_stats_item in per_instance_stats:
59
+ assert per_instance_stats_item.train_trial_index == 0
60
+ assert per_instance_stats_item.perturbation is None
61
+ for stat in per_instance_stats_item.stats:
62
+ if performance_metric_matcher.matches(stat.name):
63
+ assert per_instance_stats_item.instance_id not in instance_id_to_performance
64
+ if stat.mean is not None:
65
+ instance_id_to_performance[per_instance_stats_item.instance_id] = stat.mean
66
+
67
+ return instance_id_to_performance
68
+
69
+ def _compute_robustness_for_runs(self, runs: List[Run], performance_metric_matcher: MetricNameMatcher) -> float:
70
+ instance_id_to_performances: Dict[str, List[float]] = {}
71
+ for run in runs:
72
+ for instance_id, performance in self._get_instance_id_to_performance(
73
+ run, performance_metric_matcher
74
+ ).items():
75
+ if instance_id not in instance_id_to_performances:
76
+ instance_id_to_performances[instance_id] = []
77
+ instance_id_to_performances[instance_id].append(performance)
78
+ instance_id_to_robustness: Dict[str, float] = {}
79
+ for instance_id, performances in instance_id_to_performances.items():
80
+ instance_id_to_robustness[instance_id] = 1 - (max(performances) - min(performances))
81
+ return sum(instance_id_to_robustness.values()) / len(instance_id_to_robustness.values())
82
+
83
+ def _compute_robustness_for_run_group(self, run_group: RunGroup) -> Dict[str, float]:
84
+ performance_metric_group = self.schema.name_to_metric_group[self.PERFORMANCE_METRIC_GROUP_NAME]
85
+ assert len(performance_metric_group.metrics) == 1
86
+ performance_metric_matcher = performance_metric_group.metrics[0].substitute(run_group.environment)
87
+
88
+ group_runs = [run for run in self.runs if run_group.name in run.run_spec.groups]
89
+ model_name_to_runs: Dict[str, List[Run]] = {}
90
+
91
+ for run in group_runs:
92
+ model_name = run.run_spec.adapter_spec.model
93
+ if model_name not in model_name_to_runs:
94
+ model_name_to_runs[model_name] = []
95
+ model_name_to_runs[run.run_spec.adapter_spec.model].append(run)
96
+
97
+ model_to_robustness: Dict[str, float] = {}
98
+ for model_name, model_runs in model_name_to_runs.items():
99
+ model_to_robustness[model_name] = self._compute_robustness_for_runs(model_runs, performance_metric_matcher)
100
+ return model_to_robustness
101
+
102
+ def write_groups(self):
103
+ for run_group in self.schema.run_groups:
104
+ if self.ROBUSTNESS_METRIC_GROUP_NAME and self.PERFORMANCE_METRIC_GROUP_NAME in run_group.metric_groups:
105
+ self.run_group_to_model_name_to_robustness[run_group.name] = self._compute_robustness_for_run_group(
106
+ run_group
107
+ )
108
+ return super().write_groups()
109
+
110
+ def create_cell(
111
+ self,
112
+ runs: List[Run],
113
+ matcher: MetricNameMatcher,
114
+ additional_info: Optional[str],
115
+ hide_value: bool = False,
116
+ is_scenario_table: bool = False,
117
+ ) -> Cell:
118
+ """
119
+ Use the metric name identified by `matcher` to pull out the stats from
120
+ `runs` and return a representation of the average.
121
+ There are four cases:
122
+ 1. No matching runs
123
+ 2. Matching runs but no matching stats (maybe stat was named incorrectly)
124
+ 3. Matching runs, matching stats, but stats have count = 0, so mean is undefined
125
+ (e.g., bias metric ran and computed 0/0)
126
+ 4. Matching runs, matching stats, stats with count > 0
127
+
128
+ In the first three cases, the cell value is None, but the description distinguishes between these cases.
129
+ """
130
+         if matcher.name != self.ROBUSTNESS_METRIC_NAME:
+             return super().create_cell(runs, matcher, additional_info, hide_value, is_scenario_table)
+
+         if len(runs) == 0:
+             return Cell(value=None, description="No matching runs")
+
+         # Link the runs that this cell was aggregated from, if this is not a scenario table.
+         # Scenario tables link to the runs in the model cells,
+         # whereas non-scenario tables link to the runs in the metric cells.
+         run_spec_names: Optional[List] = None
+         if not is_scenario_table:
+             # Deduplicate run spec names because aggregated_run_specs may contain duplicate
+             # run specs if a run spec belongs to multiple groups.
+             run_spec_names = []
+             run_spec_names_set = set()
+             for run in runs:
+                 if run.run_spec.name not in run_spec_names_set:
+                     run_spec_names.append(run.run_spec.name)
+                     run_spec_names_set.add(run.run_spec.name)
+
+         run_group_set = set(runs[0].run_spec.groups) & set(self.run_group_to_model_name_to_robustness.keys())
+         assert len(run_group_set) == 1
+         run_group = next(iter(run_group_set))
+
+         model_names_set = set(run.run_spec.adapter_spec.model for run in runs)
+         assert len(model_names_set) == 1
+         model_name = next(iter(model_names_set))
+
+         # Use .get() so that a model missing from the robustness table yields None
+         # rather than raising a KeyError, and a legitimate robustness of 0.0 is preserved.
+         value = self.run_group_to_model_name_to_robustness[run_group].get(model_name)
+         description = str(round(value, 3)) if value is not None else ""
+         if hide_value:
+             value = None
+             description = ""
+         if additional_info:
+             description += "\n" + additional_info
+         if self.verbose:
+             description += "\n-- ".join(["\nRun specs:", *(run_spec_names or [])])
+
+         return Cell(
+             value=value,
+             description=description,
+             style={},
+             run_spec_names=run_spec_names,
+         )
@@ -0,0 +1,203 @@
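+ """Entry point that runs HELM benchmarks with the REEval runner.
+
+ This mirrors ``helm.benchmark.run``: it parses the same run arguments, but
+ attaches ``REEvalParameters`` (the initial ``--model-ability`` estimate) to
+ every run spec and executes through
+ ``helm.benchmark.reeval_runner.REEvalRunner``.
+ """
+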
+ import argparse
+ from dataclasses import replace
+ import re
+ from typing import List
+
+ from helm.benchmark import model_metadata_registry
+ from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
+ from helm.common.general import ensure_directory_exists
+ from helm.common.hierarchical_logger import hlog, htrack
+ from helm.common.authentication import Authentication
+ from helm.proxy.services.remote_service import create_authentication, add_service_args
+
+ from helm.benchmark.config_registry import (
+     register_configs_from_directory,
+     register_builtin_configs_from_helm_package,
+ )
+ from helm.benchmark.runner import set_benchmark_output_path
+ from helm.common.reeval_parameters import REEvalParameters
+ from helm.benchmark.run import (
+     run_benchmarking,
+     validate_args,
+     add_run_args,
+     run_entries_to_run_specs,
+ )
+
+
+ @htrack(None)
+ def main():
+     parser = argparse.ArgumentParser()
+     add_service_args(parser)
+     parser.add_argument(
+         "-c",
+         "--conf-paths",
+         nargs="+",
+         help="Where to read RunSpecs to run from",
+         default=[],
+     )
+     parser.add_argument(
+         "--models-to-run",
+         nargs="+",
+         help="Only run RunSpecs with these models. If no model is specified, run with all models.",
+         default=None,
+     )
+     parser.add_argument(
+         "--groups-to-run",
+         nargs="+",
+         help="Only run RunSpecs with these (scenario) groups. If no group is specified, run with all groups.",
+         default=None,
+     )
+     parser.add_argument(
+         "--exit-on-error",
+         action="store_true",
+         help="Fail and exit immediately if a particular RunSpec fails.",
+     )
+     parser.add_argument(
+         "--skip-completed-runs",
+         action="store_true",
+         help="Skip RunSpecs that have already completed, i.e. whose output files exist.",
+     )
+     parser.add_argument(
+         "--priority",
+         type=int,
+         default=None,
+         help="Run RunSpecs with priority less than or equal to this number. "
+         "If --priority is not specified, run everything.",
+     )
+     parser.add_argument(
+         "--run-specs",
+         nargs="*",
+         help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+         "Specifies run entries to run.",
+         default=[],
+     )
+     parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+     parser.add_argument(
+         "--enable-huggingface-models",
+         nargs="+",
+         default=[],
+         help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+         "Format: namespace/model_name[@revision]",
+     )
+     parser.add_argument(
+         "--enable-local-huggingface-models",
+         nargs="+",
+         default=[],
+         help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+     )
+     # REEval parameters
+     parser.add_argument(
+         "--model-ability",
+         type=float,
+         default=0.0,
+         help="The initial ability estimate of the model for REEval evaluation.",
+     )
+     add_run_args(parser)
+     args = parser.parse_args()
+     validate_args(args)
+
+     register_builtin_configs_from_helm_package()
+     register_configs_from_directory(args.local_path)
+
+     if args.enable_huggingface_models:
+         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
+
+         for huggingface_model_name in args.enable_huggingface_models:
+             register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
+     if args.enable_local_huggingface_models:
+         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
+
+         for huggingface_model_path in args.enable_local_huggingface_models:
+             register_huggingface_local_model_from_flag_value(huggingface_model_path)
+
+     run_entries: List[RunEntry] = []
+     if args.conf_paths:
+         run_entries.extend(read_run_entries(args.conf_paths).entries)
+     if args.run_entries:
+         run_entries.extend(
+             [RunEntry(description=description, priority=1, groups=None) for description in args.run_entries]
+         )
+     # TODO: Remove this eventually.
+     if args.run_specs:
+         run_entries.extend(
+             [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
+         )
+
+     # Must set the benchmark output path before getting RunSpecs,
+     # because run spec functions can use the benchmark output directory for caching.
+     ensure_directory_exists(args.output_path)
+     set_benchmark_output_path(args.output_path)
+
+     # Validate the --models-to-run flag
+     if args.models_to_run:
+         all_models = set(model_metadata_registry.get_all_models())
+         for model_to_run in args.models_to_run:
+             if model_to_run not in all_models:
+                 raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+     else:
+         model_expander_pattern = re.compile(
+             r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
+         )
+         if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+             raise Exception("--models-to-run must be set if the `model=` run expander expands to multiple models")
+
+     run_specs = run_entries_to_run_specs(
+         run_entries=run_entries,
+         max_eval_instances=args.max_eval_instances,
+         num_train_trials=args.num_train_trials,
+         models_to_run=args.models_to_run,
+         groups_to_run=args.groups_to_run,
+         priority=args.priority,
+     )
+     hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")
+
+     if len(run_specs) == 0:
+         hlog("There were no RunSpecs or they got filtered out.")
+         return
+
+     # Attach REEval parameters (the initial model ability estimate) to the
+     # adapter spec of every run spec.
+     run_specs = [
+         replace(
+             run_spec,
+             adapter_spec=replace(
+                 run_spec.adapter_spec, reeval_parameters=REEvalParameters(model_ability=args.model_ability)
+             ),
+         )
+         for run_spec in run_specs
+     ]
+
+     auth: Authentication = (
+         Authentication("") if args.skip_instances or not args.server_url else create_authentication(args)
+     )
+
+     run_benchmarking(
+         run_specs=run_specs,
+         auth=auth,
+         url=args.server_url,
+         local_path=args.local_path,
+         num_threads=args.num_threads,
+         output_path=args.output_path,
+         suite=args.suite,
+         dry_run=args.dry_run,
+         skip_instances=args.skip_instances,
+         cache_instances=args.cache_instances,
+         cache_instances_only=args.cache_instances_only,
+         skip_completed_runs=args.skip_completed_runs,
+         exit_on_error=args.exit_on_error,
+         runner_class_name="helm.benchmark.reeval_runner.REEvalRunner",
+         mongo_uri=args.mongo_uri,
+         disable_cache=args.disable_cache,
+     )
+
+     if args.run_specs:
+         hlog(
+             "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
+             "Use --run-entries instead."
+         )
+
+     hlog("Done.")
+
+
+ if __name__ == "__main__":
+     main()
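+
+
+ # Example invocation (illustrative only: the module path is an assumption based
+ # on the imports above, and <model_name> stands for any registered model):
+ #
+ #   python -m helm.benchmark.reeval_run \
+ #       --run-entries mmlu:subject=anatomy,model=<model_name> \
+ #       --suite my-suite --max-eval-instances 100 --model-ability 0.0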