crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.


Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
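
Selected hunks from the diff follow, by file. The first three add new scenario tests that share one pattern: each is tagged with the `scenarios` pytest marker (so it can be selected with `pytest -m scenarios`, assuming the marker is registered in the project's pytest configuration), instantiates its scenario, materializes instances into a temporary directory, and asserts against known instance content. Most of the scenario hunks after that make one recurring change: importing `TaxonomyInfo` and `ScenarioMetadata` and adding a `get_metadata()` method to the scenario class.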

helm/benchmark/scenarios/test_bluex_scenario.py
@@ -0,0 +1,59 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bluex_scenario import BLUEXScenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_bluex_scenario():
+    scenario = BLUEXScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+    assert len(instances) > 100
+
+    assert instances[100].split == TEST_SPLIT
+
+    assert instances[0].input.text.startswith("Rubião fitava a enseada, - eram oito horas da manhã Quem o visse")
+
+    assert len(instances[0].input.text) == 1011
+
+    assert instances[0].references == [
+        Reference(
+            output=Output(
+                text='a contemplação das paisagens naturais, como se lê em "ele admirava aquele pedaço de água quieta".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a presença de um narrador-personagem, como se lê em "em verdade vos digo que pensava em '
+                'outra coisa".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a sobriedade do protagonista ao avaliar o seu percurso, como se lê em "Cotejava o passado com '
+                "o presente."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='o sentido místico e fatalista que rege os destinos, como se lê em "Deus escreve direito por '
+                'linhas tortas".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a reversibilidade entre o cômico e o trágico, como se lê em "de modo que o que parecia uma '
+                'desgraça...".'
+            ),
+            tags=[CORRECT_TAG],
+        ),
+    ]
+
+    assert instances[0].references[4].is_correct

helm/benchmark/scenarios/test_exams_multilingual_scenario.py
@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TRAIN_SPLIT, Input
+
+
+@pytest.mark.scenarios
+def test_exam_multilingual_scenario_get_instances():
+    scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="Physics")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 393
+    assert actual_instances[0].id == "4c05bbb8-7729-11ea-9116-54bef70b159e"
+    assert actual_instances[0].input == Input(text="Наелектризирането по индукция се обяснява с: ")
+    assert len(actual_instances[0].references) == 4
+    assert actual_instances[0].references[0].output.text == "преразпределение на положителните йони в тялото"
+    assert actual_instances[0].references[0].tags == []
+    assert (
+        actual_instances[0].references[1].output.text == "предаване на електрони от неутрално на наелектризирано тяло"
+    )
+    assert actual_instances[0].references[1].tags == []
+    assert (
+        actual_instances[0].references[2].output.text == "предаване на електрони от наелектризирано на неутрално тяло"
+    )
+    assert actual_instances[0].references[2].tags == []
+    assert actual_instances[0].references[3].output.text == "преразпределение на свободните електрони в тялото"
+    assert actual_instances[0].references[3].tags == [CORRECT_TAG]
+    assert actual_instances[0].split == TRAIN_SPLIT

helm/benchmark/scenarios/test_healtha_br_scenario.py
@@ -0,0 +1,57 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_healthqa_br_instance():
+    scenario = HEALTHQA_BR_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+    instance = instances[35]
+
+    assert instance.split == TEST_SPLIT
+
+    assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+    assert instance.references == [
+        Reference(
+            output=Output(
+                text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+                "tentativa de redução do volume."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+                "imediata do cirurgião."
+            ),
+            tags=[CORRECT_TAG],
+        ),
+        Reference(
+            output=Output(
+                text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+                "abdominal."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+            tags=[],
+        ),
+    ]
+
+    correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+    assert len(correct_refs) == 1
+
+    assert instance.references[1].is_correct

helm/benchmark/scenarios/thai_exam_scenario.py
@@ -2,6 +2,7 @@ import os
 from typing import Dict, List
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -142,3 +144,96 @@ class ThaiExamScenario(Scenario):
             instances.extend(self.process_jsonl(jsonl_path, splits[split]))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.exam == "onet":
+            return ScenarioMetadata(
+                name="thai_exam_onet",
+                display_name="ONET",
+                description="The Ordinary National Educational Test (ONET) is an examination for students "
+                "in Thailand. We select the grade-12 ONET exam, which comprises 5 subjects and "
+                "each question has 5 choices. These subjects are Thai, English, Mathematics, "
+                "Social Studies, and Science. Amounting to a total of 170 questions and "
+                "options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "ic":
+            return ScenarioMetadata(
+                name="thai_exam_ic",
+                display_name="IC",
+                description="The Investment Consultant (IC) examination, a licensing test for investment "
+                "professionals in Thailand. Developed by the Stock Exchange of Thailand (SET), "
+                "features 4 choices per question. We extracted questions for levels 1, 2, and 3 "
+                "resulting in a total of 95 questions and options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="licensing for investment professionals",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tgat":
+            return ScenarioMetadata(
+                name="thai_exam_tgat",
+                display_name="TGAT",
+                description="The Thai General Aptitude Test (TGAT), a national high school examination in "
+                "Thailand. Focuses on critical and logical thinking skills. We collected a "
+                "total of 90 questions and answers. The TGAT consists of four choices per "
+                "question.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school level questions on reasoning",
+                    when="?",
+                    who="n/a",
+                    language="English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tpat1":
+            return ScenarioMetadata(
+                name="thai_exam_tpat1",
+                display_name="TPAT-1",
+                description="TBD",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "a_level":
+            return ScenarioMetadata(
+                name="thai_exam_a_level",
+                display_name="A-Level",
+                description="An academic knowledge assessment examination (Applied Knowledge Level) that "
+                "covers general foundational subjects taught in schools. The content assessed "
+                "in this examination aligns with the curriculum guidelines and emphasizes the "
+                "practical application of knowledge in daily life. We collected a total of 175 "
+                "questions and answers.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        else:
+            raise ValueError(f"Unknown exam: {self.exam}")
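
Every scenario file below gets the same treatment as thai_exam_scenario.py above: a get_metadata() method that self-reports the scenario's schema entry, apparently migrating display metadata out of the central YAML schemas (note the schema_classic.yaml -17 entry in the file list). The sketch below is a rough, self-contained rendering of what these metadata types look like, inferred purely from the call sites visible in this diff; the real definitions live in helm.benchmark.scenarios.scenario and helm.benchmark.presentation.taxonomy_info, and the field types and defaults here are assumptions.

# Sketch only: field names mirror the call sites above; types/defaults are guesses.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    task: Optional[str] = None
    what: Optional[str] = None
    when: Optional[str] = None
    who: Optional[str] = None
    language: Optional[str] = None


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str
    display_name: str
    main_metric: str
    main_split: str
    description: Optional[str] = None
    short_display_name: Optional[str] = None
    taxonomy: Optional[TaxonomyInfo] = None


# Mirrors the "onet" branch above:
metadata = ScenarioMetadata(
    name="thai_exam_onet",
    display_name="ONET",
    main_metric="exact_match",
    main_split="test",
    taxonomy=TaxonomyInfo(task="question answering", language="Thai and English"),
)
print(f"{metadata.display_name}: {metadata.main_metric} on {metadata.main_split}")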

helm/benchmark/scenarios/the_pile_scenario.py
@@ -5,9 +5,10 @@ import sys
 import requests
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 class ThePileScenario(Scenario):
@@ -146,3 +147,14 @@ class ThePileScenario(Scenario):
         instances = [instances[i] for i in indices]
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="the_pile",
+            display_name="The Pile",
+            description="The Pile corpus for measuring language model performance across various "
+            "domains [(Gao et al., 2020)](https://arxiv.org/pdf/2101.00027.pdf).",
+            taxonomy=TaxonomyInfo(task="language modeling", what="?", when="?", who="?", language="English, code"),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/truthful_qa_scenario.py
@@ -2,6 +2,7 @@ import csv
 import os
 from typing import List, Dict, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -154,3 +156,15 @@ class TruthfulQAScenario(Scenario):
         valid_instances: List[Instance] = get_split_instances(VALID_SPLIT, data[split_k:])
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="truthful_qa",
+            display_name="TruthfulQA",
+            description="The TruthfulQA benchmark for measuring model truthfulness and commonsense "
+            "knowledge in question answering [(Lin et al., "
+            "2022)](https://aclanthology.org/2022.acl-long.229/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )

helm/benchmark/scenarios/twitter_aae_scenario.py
@@ -2,9 +2,10 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 CODALAB_URI_TEMPLATE: str = (
     "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -56,3 +57,21 @@ class TwitterAAEScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="twitter_aae",
+            display_name="TwitterAAE",
+            description="The TwitterAAE corpus of [Blodgett et al. "
+            "(2016)](https://aclanthology.org/D16-1120/) for measuring language model "
+            "performance in tweets as a function of speaker dialect.",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English (AAE-aligned and White-aligned)",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/vicuna_scenario.py
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class VicunaScenario(Scenario):
@@ -47,3 +48,22 @@ class VicunaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vicuna",
+            display_name="Vicuna",
+            short_display_name="Vicuna",
+            description="The set of prompts used by the "
+            "[Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Unknown",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/wikifact_scenario.py
@@ -2,6 +2,7 @@ import os
 from typing import List, Dict
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PID_TO_NAME = {
@@ -183,3 +185,21 @@ class WIKIFactScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wikifact",
+            display_name="WikiFact",
+            description="Scenario introduced in this work, inspired by [Petroni et al. "
+            "(2019)](https://aclanthology.org/D19-1250/), to more extensively test factual "
+            "knowledge.",
+            taxonomy=TaxonomyInfo(
+                task="knowledge base completion",
+                what="entity-relation-entity triples in natural language form",
+                when="?",
+                who="automatically generated from templates",
+                language="structured English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/wildbench_scenario.py
@@ -2,11 +2,13 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -81,3 +83,19 @@
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="WildBench",
+            description=self.description,
+            main_metric="wildbench_score_rescaled",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="GPT-judged instruction following with instructions collected from real-user conversations",
+                who="real-world users",
+                when="2024",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/wmt_14_scenario.py
@@ -1,5 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -106,3 +108,20 @@ class WMT14Scenario(Scenario):
                 )
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wmt_14",
+            display_name="WMT 2014",
+            description="WMT 2014 is a collection of machine translation datasets "
+            "[(website)](https://www.statmt.org/wmt14/index.html).",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="multilingual sentences",
+                when="before 2014",
+                who="Europarl, news, Common Crawl, etc.",
+                language="English, French, Czech, etc.",
+            ),
+            main_metric="bleu_4",
+            main_split="test",
+        )

helm/benchmark/slurm_jobs.py
@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:
 
 
 class SlurmJobState:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     # Non-exhaustive list of Slurm job states.
     # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
 
@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
     except subprocess.CalledProcessError as e:
         # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
         raise Exception(f"{str(e)} output: {e.output}")
-    search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+    search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
     if not search_result:
        raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
     return search_result.group(1)
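
Aside from dropping a stale TODO, the slurm_jobs.py change is a lint fix: in a plain string literal, \w is an unrecognized escape sequence, which Python has warned about since 3.6 (and reports as a SyntaxWarning from 3.12 on). The raw-string form compiles to the identical pattern without the warning; a quick check, using made-up scontrol-style output:

import re

# The raw string passes the backslash through to the regex engine unchanged.
sample = "JobId=42 JobName=helm JobState=RUNNING Reason=None"  # fabricated example output
match = re.search(r"JobState=(\w+)", sample)
assert match is not None and match.group(1) == "RUNNING"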

helm/benchmark/slurm_runner.py
@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
     FAILURE_SLURM_JOB_STATES,
 )
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging
 
 from helm.benchmark.runner_config_registry import RUNNER_CONFIG
 
@@ -343,7 +343,14 @@ def main():
         help="Path to the RunSpec JSON file",
         required=True,
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="Path to a YAML file to customize logging",
+    )
     args = parser.parse_args()
+    setup_default_logging(args.log_config)
 
     # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
     with open(args.slurm_runner_spec_path, "r") as f:
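
Together with the reworked helm/common/hierarchical_logger.py (+103/-34 in the file list), this gives the detached Slurm worker the same optional --log-config hook as the main entry point: setup_default_logging is called with either the YAML path or None, and presumably falls back to the default hierarchical logging behavior when no file is given; the YAML format itself is not visible in these hunks.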