crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/alrage_metric.py
@@ -0,0 +1,35 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class ALRAGEMetric(Metric):
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        assert "alrage" in request_state.annotations
+        return [
+            Stat(MetricName("alrage_score")).add(request_state.annotations["alrage"]["score"]),
+        ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="alrage_score",
+                display_name="ALRAGE Score",
+                short_display_name="Score",
+                description="Score of the output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
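
The hunk above shows only the metric class; how it is attached to a run is not part of this diff excerpt. As a rough illustrative sketch (not taken from this release; the actual ALRAGE wiring lives in helm/benchmark/run_specs/arabic_run_specs.py and may differ), a HELM run spec would typically reference the class by its fully qualified name, assuming the usual MetricSpec(class_name=..., args=...) constructor from helm.benchmark.metrics.metric:

# Hypothetical wiring sketch only -- not part of the diff above.
from typing import List

from helm.benchmark.metrics.metric import MetricSpec


def get_alrage_metric_specs() -> List[MetricSpec]:
    # Reference the new metric class by its fully qualified name so the
    # runner can instantiate it and collect the "alrage_score" stat.
    return [
        MetricSpec(class_name="helm.benchmark.metrics.alrage_metric.ALRAGEMetric", args={}),
    ]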
helm/benchmark/metrics/basic_metrics.py
@@ -8,7 +8,7 @@ import numpy as np
 import scipy  # type: ignore
 import calibration as cal  # type: ignore
 from helm.benchmark.adaptation.scenario_state import ScenarioState
-from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics
+from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics, get_reference_metrics_metadata
 from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
 from helm.benchmark.metrics.reference_metric import ReferenceMetric
 
@@ -25,7 +25,14 @@ from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance
-from helm.benchmark.metrics.metric import Metric, MetricInterface, MetricResult, add_context, get_unique_stat_by_name
+from helm.benchmark.metrics.metric import (
+    Metric,
+    MetricInterface,
+    MetricMetadata,
+    MetricResult,
+    add_context,
+    get_unique_stat_by_name,
+)
 from helm.benchmark.metrics.metric_name import MetricContext, MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -104,6 +111,35 @@ def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]:
     return derived_stats
 
 
+def _get_perplexity_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="perplexity",
+            display_name="Perplexity",
+            short_display_name="PPL",
+            description="Perplexity of the output completion (effective branching factor per output token).",
+            lower_is_better=True,
+            group=None,
+        ),
+        MetricMetadata(
+            name="logprob_per_byte",
+            display_name="Log probability / byte",
+            short_display_name="Logprob/byte",
+            description="Predicted output's average log probability normalized by the number of bytes.",
+            lower_is_better=False,
+            group=None,
+        ),
+        MetricMetadata(
+            name="bits_per_byte",
+            display_name="Bits/byte",
+            short_display_name="BPB",
+            description="Average number of bits per byte according to model probabilities.",
+            lower_is_better=True,
+            group=None,
+        ),
+    ]
+
+
 class InstancesPerSplitMetric(MetricInterface):
     """Report the average num_instances in each MetricContext across train_trials."""
 
@@ -133,6 +169,16 @@ class InstancesPerSplitMetric(MetricInterface):
         # There are no per-instance Stats.
         return MetricResult(list(global_stats.values()), [])
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="num_instances",
+                display_name="# eval",
+                description="Number of evaluation instances.",
+                lower_is_better=None,
+            )
+        ]
+
 
 class BasicGenerationMetric(Metric):
     """
@@ -180,6 +226,15 @@ class BasicGenerationMetric(Metric):
         derived_stats.extend(compute_calibration_metrics(per_instance_stats))
         return derived_stats
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return (
+            get_request_state_metrics_metadata(self.efficiency_metric)
+            + get_reference_metrics_metadata(self.names)
+            + _get_language_modeling_metrics_metadata()
+            + _get_perplexity_metrics_metadata()
+            + _get_calibration_metrics_metadata()
+        )
+
 
 class BasicReferenceMetric(ReferenceMetric):
     """
@@ -295,6 +350,33 @@ class BasicReferenceMetric(ReferenceMetric):
         )
         return stats
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="max_prob",
+                display_name="Max prob",
+                description="Model's average confidence in its prediction (only computed for classification tasks)",
+                lower_is_better=False,
+                group="calibration_detailed",
+            ),
+            MetricMetadata(
+                name="exact_match",
+                display_name="Exact match",
+                short_display_name="EM",
+                description="Fraction of instances that the predicted output matches a correct reference exactly.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="predicted_index",
+                display_name="Predicted index",
+                description="Integer index of the reference (0, 1, ...) that was predicted by the model (for "
+                "multiple-choice).",
+                lower_is_better=None,
+                group=None,
+            ),
+        ]
+
 
 def compute_request_state_metrics(
     efficiency_metric: EfficiencyMetric,
@@ -319,6 +401,34 @@
     return stats
 
 
+def get_request_state_metrics_metadata(
+    efficiency_metric: EfficiencyMetric,
+) -> List[MetricMetadata]:
+    metric_metadata = [
+        MetricMetadata(
+            name="num_references",
+            display_name="# ref",
+            description="Number of references.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="num_train_trials",
+            display_name="# trials",
+            description="Number of trials, where in each trial we choose an independent, random set of training "
+            "instances.",
+            lower_is_better=None,
+            group="general_information",
+        ),
+    ]
+    return (
+        metric_metadata
+        + efficiency_metric.get_metadata()
+        + _get_finish_reason_metrics_metadata()
+        + _get_truncation_metrics_metadata()
+    )
+
+
 def _compute_finish_reason_metrics(
     adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
 ) -> List[Stat]:
@@ -341,6 +451,40 @@
     ]
 
 
+def _get_finish_reason_metrics_metadata():
+    return [
+        MetricMetadata(
+            name="finish_reason_endoftext",
+            display_name="finish b/c endoftext",
+            description="Fraction of instances where the the output was terminated because the end of text token "
+            "was generated.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="finish_reason_length",
+            display_name="finish b/c length",
+            description="Fraction of instances where the the output was terminated because of the max tokens limit.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="finish_reason_stop",
+            display_name="finish b/c stop",
+            description="Fraction of instances where the the output was terminated because of the stop sequences.",
+            lower_is_better=None,
+            group=None,
+        ),
+        MetricMetadata(
+            name="finish_reason_unknown",
+            display_name="finish b/c unknown",
+            description="Fraction of instances where the the output was terminated for unknown reasons.",
+            lower_is_better=None,
+            group=None,
+        ),
+    ]
+
+
 def _compute_truncation_metrics(
     adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
 ) -> List[Stat]:
@@ -354,6 +498,26 @@
     ]
 
 
+def _get_truncation_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="num_train_instances",
+            display_name="# train",
+            description="Number of training instances (e.g., in-context examples).",
+            lower_is_better=None,
+        ),
+        MetricMetadata(
+            name="prompt_truncated",
+            display_name="truncated",
+            description="Fraction of instances where the "
+            "prompt itself was truncated (implies "
+            "that there were no in-context "
+            "examples).",
+            lower_is_better=None,
+        ),
+    ]
+
+
 def compute_language_modeling_metrics(
     adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
 ) -> List[Stat]:
@@ -387,6 +551,30 @@
     ]
 
 
+def _get_language_modeling_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="logprob",
+            display_name="Log probability",
+            short_display_name="Logprob",
+            description="Predicted output's average log probability (input's log prob for language modeling).",
+            lower_is_better=False,
+        ),
+        MetricMetadata(
+            name="num_perplexity_tokens",
+            display_name="# tokens",
+            description="Average number of tokens in the predicted output (for language modeling, the input too).",
+            lower_is_better=None,
+        ),
+        MetricMetadata(
+            name="num_bytes",
+            display_name="# bytes",
+            description="Average number of bytes in the predicted output (for language modeling, the input too).",
+            lower_is_better=None,
+        ),
+    ]
+
+
 def _has_non_zero_valued_logprobs(per_instance_stats: Dict[Instance, List[Stat]]) -> bool:
     """Return whether the per-instance stats contain non-zero-valued logprobs.
 
@@ -448,3 +636,80 @@ def compute_calibration_metrics(per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]:
     stats.append(Stat(MetricName("platt_ece_1_bin")).add(platt_ece_1_bin))
 
     return stats
+
+
+def _get_calibration_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="ece_10_bin",
+            display_name="10-bin expected calibration error",
+            short_display_name="ECE (10-bin)",
+            description="The average difference between the model's confidence and accuracy, averaged across 10 "
+            "bins where each bin contains an equal number of points (only computed for classification "
+            "tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because "
+            "each bin will have very few examples.",
+            lower_is_better=True,
+            group="calibration",
+        ),
+        MetricMetadata(
+            name="ece_1_bin",
+            display_name="1-bin expected calibration error",
+            short_display_name="ECE (1-bin)",
+            description="The (absolute value) difference between the model's average confidence and accuracy "
+            "(only computed for classification tasks).",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="selective_acc@10",
+            display_name="Accuracy at 10% coverage",
+            short_display_name="Acc@10%",
+            description="The accuracy for the 10% of predictions that the model is most confident on (only "
+            "computed for classification tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="selective_cov_acc_area",
+            display_name="Selective coverage-accuracy area",
+            short_display_name="Selective Acc",
+            description="The area under the coverage-accuracy curve, a standard selective classification metric "
+            "(only computed for classification tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_coef",
+            display_name="Platt Scaling Coefficient",
+            short_display_name="Platt Coef",
+            description="Coefficient of the Platt scaling classifier (can compare this across tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_intercept",
+            display_name="Platt Scaling Intercept",
+            short_display_name="Platt Intercept",
+            description="Intercept of the Platt scaling classifier (can compare this across tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_ece_10_bin",
+            display_name="10-bin Expected Calibration Error (after Platt scaling)",
+            short_display_name="Platt-scaled ECE (10-bin)",
+            description="10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted "
+            "probabilities.",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_ece_1_bin",
+            display_name="1-bin expected calibration error (after Platt scaling)",
+            short_display_name="Platt-scaled ECE (1-bin)",
+            description="1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted "
+            "probabilities.",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+    ]
helm/benchmark/metrics/bbq_metrics.py
@@ -1,6 +1,7 @@
 from typing import List
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.common.request import RequestResult
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.metric_name import MetricName
@@ -145,3 +146,14 @@ class BBQMetric(EvaluateInstancesMetric):
         stats = [acc, amb_bias_stat, disamb_bias_stat]
 
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="bbq_accuracy",
+                display_name="BBQ accuracy",
+                description="BBQ accuracy",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/classification_metrics.py
@@ -6,7 +6,7 @@ from sklearn.preprocessing import MultiLabelBinarizer
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
-from helm.benchmark.metrics.metric import MetricName
+from helm.benchmark.metrics.metric import MetricMetadata, MetricName
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.hierarchical_logger import hwarn
@@ -168,3 +168,21 @@ class MultipleChoiceClassificationMetric(EvaluateInstancesMetric):
             Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
             Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="classification_macro_f1",
+                display_name="Macro F1",
+                description="Macro F1",
+                lower_is_better=False,
+                group="classification_metrics",
+            ),
+            MetricMetadata(
+                name="classification_micro_f1",
+                display_name="Micro F1",
+                description="Population-level micro-averaged F1 score.",
+                lower_is_better=False,
+                group="classification_metrics",
+            ),
+        ]
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py
@@ -3,7 +3,7 @@ from typing import Any, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -70,3 +70,14 @@ class ConvFinQACalcMetric(Metric):
         return [
             Stat(MetricName("float_equiv")).add(float_equiv(model_answer, gold_answer)),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="float_equiv",
+                display_name="Float Equivalence",
+                description="Float Equivalence",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/dry_run_metrics.py
@@ -8,7 +8,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
-from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
+from helm.benchmark.metrics.metric import MetricInterface, MetricMetadata, MetricResult, PerInstanceStats
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.tokens.auto_token_cost_estimator import AutoTokenCostEstimator
@@ -93,3 +93,32 @@ class DryRunMetric(MetricInterface):
         merge_stat(stats, Stat(MetricName("num_requests")).add(len(scenario_state.request_states)))
 
         return MetricResult(list(stats.values()), per_instance_stats)
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="estimated_num_tokens_cost",
+                display_name="cost",
+                short_display_name=None,
+                description="An estimate of the number of tokens (including prompt and output completions) needed to "
+                "perform the request.",
+                lower_is_better=None,
+                group=None,
+            ),
+            MetricMetadata(
+                name="num_completions",
+                display_name="# completions",
+                short_display_name=None,
+                description="Number of completions.",
+                lower_is_better=None,
+                group=None,
+            ),
+            MetricMetadata(
+                name="num_prompt_tokens",
+                display_name="# prompt tokens",
+                short_display_name=None,
+                description="Number of tokens in the prompt.",
+                lower_is_better=None,
+                group="general_information",
+            ),
+        ]
helm/benchmark/metrics/efficiency_metrics.py
@@ -9,6 +9,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -162,6 +163,79 @@ class EfficiencyMetric:
         stats.append(Stat(MetricName("inference_idealized_runtime")).add(idealized_runtime))
         return stats
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="num_prompt_tokens",
+                display_name="# prompt tokens",
+                short_display_name=None,
+                description="Number of tokens in the prompt.",
+                lower_is_better=None,
+                group="general_information",
+            ),
+            MetricMetadata(
+                name="num_completion_tokens",
+                display_name="# completion tokens",
+                description="Actual number of completion tokens (over all completions).",
+                lower_is_better=None,
+            ),
+            MetricMetadata(
+                name="num_output_tokens",
+                display_name="# output tokens",
+                description="Actual number of output tokens.",
+                lower_is_better=None,
+            ),
+            MetricMetadata(
+                name="training_co2_cost",
+                display_name="Estimated training emissions (kg CO2)",
+                short_display_name="Training emissions (kg CO2)",
+                description="Estimate of the CO2 emissions from training the model.",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="training_energy_cost",
+                display_name="Estimated training energy cost (MWh)",
+                short_display_name="Training energy (MWh)",
+                description="Estimate of the amount of energy used to train the model.",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="inference_runtime",
+                display_name="Observed inference runtime (s)",
+                short_display_name="Observed inference time (s)",
+                description="Average observed time to process a request to the model (via an API, and thus depends on "
+                "particular deployment).",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="batch_size",
+                display_name="Batch size",
+                description="For batch jobs, how many requests are in a batch.",
+                lower_is_better=None,
+            ),
+            MetricMetadata(
+                name="inference_denoised_runtime",
+                display_name="Denoised inference runtime (s)",
+                short_display_name="Denoised inference time (s)",
+                description="Average time to process a request to the model minus performance contention by using "
+                "profiled runtimes from multiple trials of SyntheticEfficiencyScenario.",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="inference_idealized_runtime",
+                display_name="Idealized inference runtime (s)",
+                short_display_name="Idealized inference time (s)",
+                description="Average time to process a request to the model based solely on the model architecture "
+                "(using Megatron-LM).",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+        ]
+
 
 def _compute_estimated_time_from_prompt_size_and_num_output_tokens(
     request_state: RequestState,
helm/benchmark/metrics/ehr_sql_metrics.py
@@ -1,7 +1,7 @@
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -101,3 +101,59 @@ class EhrSqlMetric(Metric):
             Stat(MetricName("ehr_sql_total_predicted_answerable")).add(int(is_predicted_answerable)),
             Stat(MetricName("ehr_sql_total_ground_truth_answerable")).add(int(is_answerable)),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="ehr_sql_execution_accuracy",
+                display_name="Execution accuracy for Generated Query",
+                short_display_name="EHRSQLExeAcc",
+                description="Measures the proportion of correctly predicted answerable questions among all questions "
+                "predicted to be answerable.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_query_validity",
+                display_name="Validity of Generated Query",
+                short_display_name="EHRSQLQueryValid",
+                description="Measures the proportion of correctly predicted answerable questions among all answerable "
+                "questions in the dataset.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_precision_answerable",
+                display_name="Precision for Answerable Questions",
+                short_display_name="EHRSQLPreAns",
+                description="Measures the proportion of correctly predicted answerable questions among all questions "
+                "predicted to be answerable.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_recall_answerable",
+                display_name="Recall for Answerable Questions",
+                short_display_name="EHRSQLReAns",
+                description="Measures the proportion of correctly predicted answerable questions among all answerable "
+                "questions in the dataset.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_total_predicted_answerable",
+                display_name="Total Predicted Answerable",
+                short_display_name="Total Pred Ans",
+                description="Total number of questions predicted to be answerable by the model.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_total_ground_truth_answerable",
+                display_name="Total Ground Truth Answerable",
+                short_display_name="Total GT Ans",
+                description="Total number of answerable questions in the ground truth.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
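
Taken together, the metric-file hunks above follow one recurring pattern in this release: each metric class gains a get_metadata() hook returning MetricMetadata entries (name, display name, optional short display name, description, lower_is_better, group) alongside the Stats it already emits. A minimal sketch of that pattern for a hypothetical metric (MyMetric and "my_score" are illustrative stand-ins, not part of the diff):

# Sketch of the get_metadata() pattern introduced across the metric files above.
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric, MetricMetadata
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class MyMetric(Metric):
    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        # Emit the stat as before; get_metadata() now documents it for presentation.
        return [Stat(MetricName("my_score")).add(1.0)]

    def get_metadata(self) -> List[MetricMetadata]:
        # Describe every stat this metric produces.
        return [
            MetricMetadata(
                name="my_score",
                display_name="My score",
                short_display_name="Score",
                description="Example description of the emitted stat.",
                lower_is_better=False,
                group="accuracy",
            ),
        ]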