crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py ADDED
@@ -0,0 +1,52 @@
+from typing import List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+)
+from sklearn.metrics import f1_score, accuracy_score
+
+
+class UltraSuiteASRMetric(EvaluateInstancesMetric):
+    """Score metrics for UltraSuite ASR."""
+
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        y_pred: List[str] = []
+        y_pred_quasi: List[str] = []
+        y_true: List[str] = []
+        for request_state in request_states:  # one request state per instance
+
+            for reference in request_state.instance.references:
+                if reference.tags == [CORRECT_TAG]:
+                    true_label = reference.output.text
+                    break
+
+            assert request_state.result
+            model_output_text = request_state.result.completions[0].text.strip().lower()
+            assert request_state.instance.extra_data
+            ground_truth_text = request_state.instance.extra_data["transcription"].strip().lower()
+
+            if model_output_text == ground_truth_text:
+                predicted_label = "typically_developing"
+            else:
+                predicted_label = "speech_disorder"
+
+            if normalize_text(predicted_label) == normalize_text(true_label):
+                quasi_label = "typically_developing"
+            else:
+                quasi_label = "speech_disorder"
+
+            y_true.append(true_label)
+            y_pred.append(predicted_label)
+            y_pred_quasi.append(quasi_label)
+
+        return [
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+            Stat(MetricName("exact_match")).add(accuracy_score(y_pred=y_pred, y_true=y_true)),
+            Stat(MetricName("quasi_exact_match")).add(accuracy_score(y_pred=y_pred_quasi, y_true=y_true)),
+        ]
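For intuition, the new metric above labels an instance "typically_developing" only when the model's ASR transcript exactly matches the ground-truth transcription (after stripping and lowercasing), and "speech_disorder" otherwise. A minimal sketch of that labeling rule on hypothetical data:

# Toy illustration of the UltraSuiteASRMetric labeling rule (hypothetical data).
from sklearn.metrics import f1_score

# (ground-truth transcription, model ASR output, true label) per instance
instances = [
    ("a red ball", "a red ball", "typically_developing"),    # exact match
    ("a red ball", "a wed ball", "speech_disorder"),         # mismatch
    ("big blue bus", "big blue bus", "typically_developing"),
]

y_true = [label for _, _, label in instances]
y_pred = [
    "typically_developing" if truth.strip().lower() == output.strip().lower() else "speech_disorder"
    for truth, output, _ in instances
]

print(f1_score(y_true=y_true, y_pred=y_pred, average="macro"))  # 1.0 here: predictions match labels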
helm/benchmark/metrics/wildbench_metrics.py CHANGED
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
             Stat(MetricName("wildbench_score")).add(score),
             Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="wildbench_score",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="wildbench_score_rescaled",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
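The get_metadata method added here is the new self-describing pattern that many metrics gain in this release: each metric declares the statistics it emits so that helm-summarize can assemble a schema without a hand-written YAML entry. A minimal sketch for a hypothetical custom metric, using only the MetricMetadata fields visible in this diff (the real constructor may accept more arguments):

from typing import List

from helm.benchmark.metrics.metric import Metric, MetricMetadata


class MyCustomMetric(Metric):
    """Hypothetical metric illustrating the self-describing pattern."""

    def get_metadata(self) -> List[MetricMetadata]:
        # One MetricMetadata entry per stat name this metric emits.
        return [
            MetricMetadata(
                name="my_custom_score",  # must match the MetricName used for the Stat
                display_name="My Custom Score",
                short_display_name="MCS",
                description="Example score between 0 and 1.",
                lower_is_better=False,   # higher scores are better
                group="accuracy",        # metric group used by auto-generated schemas
            )
        ]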
helm/benchmark/model_deployment_registry.py CHANGED
@@ -157,12 +157,11 @@ def get_default_model_deployment_for_model(
     Example: "meta/llama-7b" => "together/llama-7b"
 
     The process to find a model deployment name is as follows:
-    1. If there is a model deployment with the same name as the model arg, use it.
-    2. If there is at least one deployment for the model, use the first one that is available.
-    3. If there are no deployments for the model, returns None.
+    1. If there is at least one deployment for the model, use the last one that is available.
+    2. If there are no deployments for the model, returns None.
 
     This function will also try to find a model deployment name that is not deprecated.
-    If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated).
+    If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
     If ignore_deprecated is True, this function will return None if the model deployment is deprecated.
 
 
@@ -175,16 +174,7 @@ def get_default_model_deployment_for_model(
         ignore_deprecated: Whether to return None if the model deployment is deprecated.
     """
 
-    # If there is a model deployment with the same name as the model arg, use it.
-    if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
-        deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
-        if deployment.deprecated and ignore_deprecated:
-            if warn_arg_deprecated:
-                hwarn(f"Model deployment {model_name} is deprecated")
-            return None
-        return deployment.name
-
-    # If there is at least one deployment for the model, use the first one that is available.
+    # If there is at least one deployment for the model, use the last one that is available.
     available_deployments: List[ModelDeployment] = [
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
    ]
@@ -199,19 +189,21 @@ def get_default_model_deployment_for_model(
         deployment for deployment in available_deployments if not deployment.deprecated
     ]
     if len(non_deprecated_deployments) > 0:
-        chosen_deployment = non_deprecated_deployments[0]
+        chosen_deployment = non_deprecated_deployments[-1]
     # There are no non-deprecated deployments, so there are two options:
     # 1. If we can return an empty string, return it. (no model deployment is available)
-    # 2. If we can't return an empty string, return the first deployment (even if it's deprecated).
+    # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
     elif ignore_deprecated:
         return None
-    else:
-        chosen_deployment = available_deployments[0]
+    elif len(available_deployments) > 0:
+        chosen_deployment = available_deployments[-1]
         if warn_arg_deprecated:
             hwarn(f"All model deployments for model {model_name} are deprecated.")
+    else:
+        return None
     if warn_arg_deprecated:
         hlog(
-            f"Choosing {chosen_deployment.name} (the first one) as "
+            f"Choosing {chosen_deployment.name} (the last one) as "
             f"the default model deployment for model {model_name}"
         )
         hlog("If you want to use a different model deployment, please specify it explicitly.")
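The behavioral change in this file is easy to miss: default deployment resolution now prefers the last matching deployment in ALL_MODEL_DEPLOYMENTS rather than the first, so deployments registered later in model_deployments.yaml win. A standalone sketch of the new selection order (simplified: warnings and the ignore_deprecated path are omitted, and Deployment is a stand-in for ModelDeployment):

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Deployment:  # simplified stand-in for ModelDeployment
    name: str
    model_name: str
    deprecated: bool = False


def pick_default(deployments: List[Deployment], model_name: str) -> Optional[str]:
    candidates = [d for d in deployments if d.model_name == model_name]
    non_deprecated = [d for d in candidates if not d.deprecated]
    if non_deprecated:
        return non_deprecated[-1].name  # last registered non-deprecated deployment wins
    if candidates:
        return candidates[-1].name      # fall back to the last deployment, even if deprecated
    return None


# Hypothetical registry: the newer entry registered later wins.
registry = [
    Deployment("together/llama-7b", "meta/llama-7b", deprecated=True),
    Deployment("newhost/llama-7b", "meta/llama-7b"),
]
assert pick_default(registry, "meta/llama-7b") == "newhost/llama-7b"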
helm/benchmark/presentation/create_plots.py CHANGED
@@ -1,4 +1,7 @@
-# mypy: check_untyped_defs = False
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 import argparse
 from collections import defaultdict
 from dataclasses import dataclass
@@ -637,8 +640,14 @@ def main():
         default="png",
         choices=["png", "pdf"],
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     create_plots(args)
 
 
helm/benchmark/presentation/run_display.py CHANGED
@@ -1,6 +1,7 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
+import re
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
 
 from helm.benchmark.adaptation.adapter_spec import (
@@ -262,9 +263,18 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if request_state.result is not None and request_state.result.completions
             else ""
         )
-        mapped_output = (
-            request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
-        )
+        mapped_output: Optional[str] = None
+        if request_state.output_mapping is not None:
+            output_to_map = predicted_text.strip()
+            if run_spec.adapter_spec.output_mapping_pattern:
+                match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+                if not match:
+                    output_to_map = ""
+                elif match.groups():
+                    output_to_map = match.group(0)
+                else:
+                    output_to_map = match.string
+            mapped_output = request_state.output_mapping.get(output_to_map)
         instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
             request_state.instance
         )
helm/benchmark/presentation/run_entry.py CHANGED
@@ -14,10 +14,10 @@ class RunEntry:
     description: str
 
     # Priority for this run spec (1 is highest priority, 5 is lowest priority)
-    priority: int
+    priority: Optional[int] = None
 
     # Additional groups to add to the run spec
-    groups: Optional[List[str]]
+    groups: Optional[List[str]] = None
 
 
 @dataclass(frozen=True)
helm/benchmark/presentation/schema.py CHANGED
@@ -8,6 +8,7 @@ import mako.template
 import yaml
 import importlib_resources as resources
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
@@ -131,24 +132,6 @@ THIS_GROUP_ONLY = "this_group_only"
 NO_GROUPS = "no_groups"
 
 
-@dataclass(frozen=True)
-class TaxonomyInfo:
-    # Task (e.g., question answering)
-    task: Optional[str] = None
-
-    # Domain - genre (e.g., Wikipedia)
-    what: Optional[str] = None
-
-    # Domain - when it was written (e.g., 2010s)
-    when: Optional[str] = None
-
-    # Domain - demographics (e.g., web users)
-    who: Optional[str] = None
-
-    # Language (e.g., English)
-    language: Optional[str] = None
-
-
 @dataclass(frozen=True)
 class RunGroup(Field):
     """
@@ -205,22 +188,27 @@ class RunGroup(Field):
     # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
     adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])
 
+    # Optional short description of the run group.
+    # This description is used in some space-constrained places in frontend tables.
+    # If unset, the description field will be used instead.
+    short_description: Optional[str] = None
+
 
 @dataclass
 class Schema:
     """Specifies information about what to display on the frontend."""
 
     # Information about each field
-    metrics: List[Field]
+    metrics: List[Field] = field(default_factory=list)
 
     # Information about each perturbation
-    perturbations: List[Field]
+    perturbations: List[Field] = field(default_factory=list)
 
     # Group the metrics
-    metric_groups: List[MetricGroup]
+    metric_groups: List[MetricGroup] = field(default_factory=list)
 
     # Group the scenarios
-    run_groups: List[RunGroup]
+    run_groups: List[RunGroup] = field(default_factory=list)
 
     # Adapter fields (e.g., temperature)
     # Automatically populated from the docstrings in the AdapterSpec class definition.
helm/benchmark/presentation/summarize.py CHANGED
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import get_all_stats_by_name
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -294,7 +303,6 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
 
 
 class AggregationStrategy:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     WIN_RATE = "win_rate"
     MEAN = "mean"
 
@@ -342,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -377,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
        self.allow_unknown_models: bool = allow_unknown_models
-
-        ensure_directory_exists(self.run_release_path)
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -427,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -510,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
@@ -839,7 +991,8 @@ class Summarizer:
         }
 
         header_name = header_field.get_short_display_name()
-        description = (run_group.description + "\n\n" if run_group.description is not None else "") + (
+        run_group_short_description = run_group.short_description or run_group.description or ""
+        description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
             (header_field.display_name if header_field.display_name else header_field.name)
             + ": "
             + (header_field.description if header_field.description is not None else "")
@@ -1070,7 +1223,8 @@ class Summarizer:
             is_scenario_table=False,
             aggregation_strategies=aggregate_strategies,
         )
-        tables.append(table)
+        if len(table.header) > 1:
+            tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.write_run_display_json(skip_completed)
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
         # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1340,8 +1504,19 @@ def main():
         default=None,
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     summarize(args)
 
 
helm/benchmark/presentation/taxonomy_info.py ADDED
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TaxonomyInfo:
+    # Task (e.g., question answering)
+    task: Optional[str] = None
+
+    # Domain - genre (e.g., Wikipedia)
+    what: Optional[str] = None
+
+    # Domain - when it was written (e.g., 2010s)
+    when: Optional[str] = None
+
+    # Domain - demographics (e.g., web users)
+    who: Optional[str] = None
+
+    # Language (e.g., English)
+    language: Optional[str] = None
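TaxonomyInfo (now in its own module, as above) and ScenarioMetadata feed the new --auto-generate-schema path in helm-summarize: when no schema YAML is given, fix_up_schema() builds metric fields from Metric.get_metadata() and run groups from Scenario.get_metadata(). A hedged sketch of the scenario side, using only the ScenarioMetadata fields that scenario_metadata_to_run_group() reads in the summarize.py hunks above; the actual constructor and Scenario signatures may differ slightly:

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


class MyScenario(Scenario):
    """Hypothetical scenario participating in schema auto-generation."""

    name = "my_scenario"
    description = "Example question answering scenario."
    tags = ["question_answering"]

    def get_instances(self, output_path: str):
        return []  # dataset loading elided in this sketch

    def get_metadata(self) -> ScenarioMetadata:
        return ScenarioMetadata(
            name="my_scenario",         # must match the run group name used in run specs
            display_name="My Scenario",
            short_display_name="MyScen",
            description="Example question answering scenario.",
            main_metric="exact_match",  # becomes ${main_name} in the auto-generated schema
            main_split="test",          # becomes ${main_split}
            taxonomy=TaxonomyInfo(task="question answering", language="English"),
        )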
helm/benchmark/presentation/test_create_plots.py CHANGED
@@ -1,4 +1,7 @@
-# mypy: check_untyped_defs = False
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 from helm.common.general import asdict_without_nones
 from helm.benchmark.presentation.table import Table, Cell, HeaderCell
 from helm.benchmark.presentation.create_plots import parse_table
helm/benchmark/run.py CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
     run_specs: List[RunSpec] = []
     for entry in run_entries:
         # Filter by priority
-        if priority is not None and entry.priority > priority:
+        if priority is not None and entry.priority is not None and entry.priority > priority:
             continue
 
         for run_spec in construct_run_specs(parse_object_spec(entry.description)):
@@ -298,8 +298,7 @@ def helm_run(args):
     hlog("Done.")
 
 
-# Separate parsing from starting HELM so we can setup logging
-def main():
+def build_parser():
     parser = argparse.ArgumentParser()
     add_service_args(parser)
     parser.add_argument(
@@ -365,9 +364,21 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     add_run_args(parser)
+    return parser
+
+
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = build_parser()
     args = parser.parse_args()
-    setup_default_logging()
+    setup_default_logging(args.log_config)
     return helm_run(args)
 
 
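helm-run, helm-summarize, and helm-create-plots all gain the same --log-config flag, forwarded to setup_default_logging. The expected YAML layout is not shown in this diff; the sketch below assumes it maps onto Python's standard logging.config.dictConfig schema, which is an assumption rather than HELM's documented format:

# Hypothetical loader: assumes the --log-config YAML is a dictConfig mapping.
import logging.config

import yaml


def load_log_config(path: str) -> None:
    with open(path) as f:
        config = yaml.safe_load(f)     # parse the YAML file into a dict
    logging.config.dictConfig(config)  # hand it to the stdlib logging machinery


# e.g. "helm-run --log-config logging.yaml ..." would then configure logging
# roughly as load_log_config("logging.yaml") does here.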
helm/benchmark/run_expander.py CHANGED
@@ -1484,6 +1484,8 @@ class OutputFormatInstructions(RunExpander):
             instructions = "Answer with only a single letter. Do not include a period in your answer."
         elif self.scenario == "mcqa_only_last_question":
             instructions = "Answer only the last question with only a single letter."
+        elif self.scenario == "arabic_mcqa":
+            instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
         else:
             instructions = "Answer with only a single letter."
     elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1525,6 +1527,8 @@ class OutputFormatInstructions(RunExpander):
                 "Answer only the last question with a short answer. "
                 "Avoid extra, unnecessary information in the answer."
             )
+        elif self.scenario == "arabic_mcqa":
+            instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
         else:
             raise ValueError(f"Unknown scenario {self.scenario}")
     elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: