crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/metrics/efficiency_metrics.py

@@ -9,6 +9,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -162,6 +163,79 @@ class EfficiencyMetric:
         stats.append(Stat(MetricName("inference_idealized_runtime")).add(idealized_runtime))
         return stats

+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="num_prompt_tokens",
+                display_name="# prompt tokens",
+                short_display_name=None,
+                description="Number of tokens in the prompt.",
+                lower_is_better=None,
+                group="general_information",
+            ),
+            MetricMetadata(
+                name="num_completion_tokens",
+                display_name="# completion tokens",
+                description="Actual number of completion tokens (over all completions).",
+                lower_is_better=None,
+            ),
+            MetricMetadata(
+                name="num_output_tokens",
+                display_name="# output tokens",
+                description="Actual number of output tokens.",
+                lower_is_better=None,
+            ),
+            MetricMetadata(
+                name="training_co2_cost",
+                display_name="Estimated training emissions (kg CO2)",
+                short_display_name="Training emissions (kg CO2)",
+                description="Estimate of the CO2 emissions from training the model.",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="training_energy_cost",
+                display_name="Estimated training energy cost (MWh)",
+                short_display_name="Training energy (MWh)",
+                description="Estimate of the amount of energy used to train the model.",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="inference_runtime",
+                display_name="Observed inference runtime (s)",
+                short_display_name="Observed inference time (s)",
+                description="Average observed time to process a request to the model (via an API, and thus depends on "
+                "particular deployment).",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="batch_size",
+                display_name="Batch size",
+                description="For batch jobs, how many requests are in a batch.",
+                lower_is_better=None,
+            ),
+            MetricMetadata(
+                name="inference_denoised_runtime",
+                display_name="Denoised inference runtime (s)",
+                short_display_name="Denoised inference time (s)",
+                description="Average time to process a request to the model minus performance contention by using "
+                "profiled runtimes from multiple trials of SyntheticEfficiencyScenario.",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+            MetricMetadata(
+                name="inference_idealized_runtime",
+                display_name="Idealized inference runtime (s)",
+                short_display_name="Idealized inference time (s)",
+                description="Average time to process a request to the model based solely on the model architecture "
+                "(using Megatron-LM).",
+                lower_is_better=True,
+                group="efficiency_detailed",
+            ),
+        ]
+

 def _compute_estimated_time_from_prompt_size_and_num_output_tokens(
     request_state: RequestState,
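
Every get_metadata method added in this release follows the same shape: the metric class enumerates the statistics it emits as MetricMetadata entries carrying a name, display names, a description, a lower_is_better flag, and an optional schema group. The MetricMetadata class itself (in helm.benchmark.metrics.metric) is not shown in this diff, so the sketch below uses a minimal stand-in with only the fields exercised here, purely to illustrate how a consumer such as a schema or leaderboard generator might walk the returned list; the summarize helper is hypothetical.

from dataclasses import dataclass
from typing import List, Optional


@dataclass(frozen=True)
class MetricMetadata:
    # Stand-in with only the fields used by the new get_metadata() methods;
    # the real class lives in helm.benchmark.metrics.metric and may differ.
    name: str
    display_name: str
    description: str
    lower_is_better: Optional[bool]
    short_display_name: Optional[str] = None
    group: Optional[str] = None


def summarize(metadata: List[MetricMetadata]) -> None:
    # Hypothetical consumer: print one schema-style row per metric.
    for m in metadata:
        direction = {True: "lower is better", False: "higher is better", None: "informational"}[m.lower_is_better]
        print(f"{m.name:35s} {m.display_name:40s} ({direction})")


summarize([
    MetricMetadata(
        name="inference_runtime",
        display_name="Observed inference runtime (s)",
        description="Average observed time to process a request to the model.",
        lower_is_better=True,
        group="efficiency_detailed",
    ),
])

The same registration pattern repeats in the EHR-SQL, GPQA chain-of-thought, helpdesk summarization, IFEval, and instruction-following hunks below.
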
helm/benchmark/metrics/ehr_sql_metrics.py

@@ -1,7 +1,7 @@
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -101,3 +101,59 @@ class EhrSqlMetric(Metric):
             Stat(MetricName("ehr_sql_total_predicted_answerable")).add(int(is_predicted_answerable)),
             Stat(MetricName("ehr_sql_total_ground_truth_answerable")).add(int(is_answerable)),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="ehr_sql_execution_accuracy",
+                display_name="Execution accuracy for Generated Query",
+                short_display_name="EHRSQLExeAcc",
+                description="Measures the proportion of correctly predicted answerable questions among all questions "
+                "predicted to be answerable.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_query_validity",
+                display_name="Validity of Generated Query",
+                short_display_name="EHRSQLQueryValid",
+                description="Measures the proportion of correctly predicted answerable questions among all answerable "
+                "questions in the dataset.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_precision_answerable",
+                display_name="Precision for Answerable Questions",
+                short_display_name="EHRSQLPreAns",
+                description="Measures the proportion of correctly predicted answerable questions among all questions "
+                "predicted to be answerable.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_recall_answerable",
+                display_name="Recall for Answerable Questions",
+                short_display_name="EHRSQLReAns",
+                description="Measures the proportion of correctly predicted answerable questions among all answerable "
+                "questions in the dataset.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_total_predicted_answerable",
+                display_name="Total Predicted Answerable",
+                short_display_name="Total Pred Ans",
+                description="Total number of questions predicted to be answerable by the model.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="ehr_sql_total_ground_truth_answerable",
+                display_name="Total Ground Truth Answerable",
+                short_display_name="Total GT Ans",
+                description="Total number of answerable questions in the ground truth.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/evaluate_reference_metrics.py

@@ -14,6 +14,7 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics import code_metrics_helper
 from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
+from helm.benchmark.metrics.metric import MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.nltk_helper import install_nltk_resources
@@ -36,7 +37,7 @@ def pass_at_k_estimator(n: int, c: int, k: int) -> float:
     """
     if n - c < k:
         return 1.0
-    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)).item()


 def normalize_text(text: str, should_remove_articles: bool = True) -> str:
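
The only behavioral change to pass_at_k_estimator is the trailing .item(), which turns the NumPy scalar into a plain Python float. The estimator itself is the standard unbiased pass@k formula, 1 - C(n - c, k) / C(n, k), evaluated through a numerically stable product. A self-contained check of that equivalence, with arbitrary example values for n, c, and k:

from math import comb

import numpy as np


def pass_at_k_estimator(n: int, c: int, k: int) -> float:
    # Unbiased pass@k given n samples of which c passed, as in the hunk above.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)).item()


n, c, k = 10, 3, 5  # arbitrary example values
closed_form = 1.0 - comb(n - c, k) / comb(n, k)
assert abs(pass_at_k_estimator(n, c, k) - closed_form) < 1e-12
print(pass_at_k_estimator(n, c, k))  # 0.9166666666666666
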
@@ -396,6 +397,16 @@ def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float:
     return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"])  # type: ignore


+def _apply_output_mapping_pattern(pattern: str, prediction: str) -> str:
+    match = re.search(pattern, prediction)
+    if not match:
+        return ""
+    elif match.groups():
+        return match.group(0)
+    else:
+        return match.string
+
+
 # TODO This should probably be made into an implementation of MetricInterface. For now it lives here
 # just to separate it from basic_metrics.py.
 def compute_reference_metrics(
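
_apply_output_mapping_pattern works together with the new output_mapping_pattern field on AdapterSpec (see adapter_spec.py in the file list) to pull the answer key out of a free-form prediction before it is looked up in output_mapping. It has three branches: no match yields an empty string, a pattern with capture groups yields the full matched text (group(0)), and a pattern without groups yields the original prediction unchanged (match.string). The patterns and predictions below are invented solely to exercise those branches:

import re


def _apply_output_mapping_pattern(pattern: str, prediction: str) -> str:
    # Copied from the hunk above.
    match = re.search(pattern, prediction)
    if not match:
        return ""
    elif match.groups():
        return match.group(0)
    else:
        return match.string


print(_apply_output_mapping_pattern(r"\(([A-E])\)", "no option given"))     # "" (no match)
print(_apply_output_mapping_pattern(r"\(([A-E])\)", "The answer is (B)."))  # "(B)" (pattern has a group)
print(_apply_output_mapping_pattern(r"[A-E]", "B"))                         # "B" (no groups: whole prediction)

Note that even when the pattern defines a capture group, the helper returns the whole match rather than the group, which suggests patterns should be written so that the full match is exactly the key used in output_mapping.
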
@@ -497,6 +508,8 @@ def compute_reference_metrics(
     # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would
     # sometimes predict a random letter like 'M'.
     if request_state.output_mapping is not None:
+        if adapter_spec.output_mapping_pattern:
+            preds = [_apply_output_mapping_pattern(adapter_spec.output_mapping_pattern, pred) for pred in preds]
         preds = [request_state.output_mapping.get(pred) for pred in preds]  # type: ignore

     # Compute max_prob, the probability that the model assigns to its generated text.
@@ -518,3 +531,301 @@ def compute_reference_metrics(
             raise NameError(f"{metric_name} is not in the list of metric functions.")

     return stats
+
+
+_METRIC_METADATA_MAPPING: Dict[str, MetricMetadata] = {
+    "exact_match": MetricMetadata(
+        name="exact_match",
+        display_name="Exact match",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly.",
+        lower_is_better=False,
+        group="accuracy",
+    ),
+    "quasi_exact_match": MetricMetadata(
+        name="quasi_exact_match",
+        display_name="Quasi-exact match",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference up to light "
+        "processing.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "quasi_leave_articles_exact_match": MetricMetadata(
+        name="quasi_leave_articles_exact_match",
+        display_name="Quasi-exact match",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference up to light "
+        "processing.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "prefix_exact_match": MetricMetadata(
+        name="prefix_exact_match",
+        display_name="Prefix exact match",
+        short_display_name="PEM",
+        description="Fraction of instances that the predicted output matches the prefix of a correct reference "
+        "exactly.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "quasi_prefix_exact_match": MetricMetadata(
+        name="quasi_prefix_exact_match",
+        display_name="Prefix quasi-exact match",
+        short_display_name="PEM",
+        description="Fraction of instances that the predicted output matches the prefix of a correct reference "
+        "up to light processing.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "exact_match_indicator": MetricMetadata(
+        name="exact_match_indicator",
+        display_name="Exact match (final)",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly, "
+        "ignoring text preceding the specified indicator (e.g., space).",
+        lower_is_better=False,
+        group=None,
+    ),
+    "final_number_exact_match": MetricMetadata(
+        name="final_number_exact_match",
+        display_name="Exact match (final number)",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly, "
+        "ignoring text preceding the specified indicator.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "exact_set_match": MetricMetadata(
+        name="exact_set_match",
+        display_name="Exact match (at sets)",
+        short_display_name="EM",
+        description="Fraction of instances that the predicted output matches a correct reference exactly as " "sets.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "iou_set_match": MetricMetadata(
+        name="iou_set_match",
+        display_name="Intersection over union (as sets)",
+        short_display_name="IoU",
+        description="Intersection over union in terms of set overlap between the model predicted set and "
+        "correct reference set.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "f1_set_match": MetricMetadata(
+        name="f1_set_match",
+        display_name="F1 (set match)",
+        short_display_name="F1",
+        description="Average F1 score in terms of set overlap between the model predicted set and correct "
+        "reference set.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "math_equiv": MetricMetadata(
+        name="math_equiv",
+        display_name="Equivalent",
+        description="Fraction of model outputs that are mathematically equivalent to the correct reference.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "math_equiv_chain_of_thought": MetricMetadata(
+        name="math_equiv_chain_of_thought",
+        display_name="Equivalent (CoT)",
+        description="Fraction of model outputs that are mathematically equivalent to the correct reference "
+        "when using chain-of-thought prompting.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "code_eval_acc": MetricMetadata(
+        name="code_eval_acc",
+        display_name="Correctness",
+        short_display_name="Correctness",
+        description="Fraction of instances that the model output evaluates to the correct answer.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "pass": MetricMetadata(
+        name="pass",
+        display_name="pass@1",
+        description="Fraction of model outputs that pass the associated test cases.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "cider": MetricMetadata(
+        name="cider",
+        display_name="CIDEr",
+        description="Evaluates the quality of generated caption by measuring the weighted similarity of "
+        "n-grams between the captions and a set of human-written reference captions, emphasizing "
+        "informativeness and consensus.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "f1_score": MetricMetadata(
+        name="f1_score",
+        display_name="F1",
+        description="Average F1 score in terms of word overlap between the model output and correct reference.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "rouge_1": MetricMetadata(
+        name="rouge_1",
+        display_name="ROUGE-1",
+        description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram " "overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "rouge_2": MetricMetadata(
+        name="rouge_2",
+        display_name="ROUGE-2",
+        description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram " "overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "rouge_l": MetricMetadata(
+        name="rouge_l",
+        display_name="ROUGE-L",
+        description="Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest "
+        "common subsequence overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "bleu_1": MetricMetadata(
+        name="bleu_1",
+        display_name="BLEU-1",
+        description="Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on "
+        "1-gram overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "bleu_4": MetricMetadata(
+        name="bleu_4",
+        display_name="BLEU-4",
+        description="Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on "
+        "4-gram overlap.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "chinese_bleu_1": MetricMetadata(
+        name="chinese_bleu_1",
+        display_name="Chinese BLEU-1 score",
+        short_display_name="BLEU-1 (Chinese)",
+        description="BLEU-1 score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on a "
+        "Chinese tokenizer that segments Chinese strings by character.",
+        lower_is_better=False,
+        group=None,
+        # Group could be one of:
+        # "cleva_pinyin_transliteration_metrics"
+        # "cleva_dialogue_generation_metrics"
+        # "cleva_data_to_text_generation_metrics"
+    ),
+    "chinese_rouge_1": MetricMetadata(
+        name="chinese_rouge_1",
+        display_name="Chinese ROUGE-1 score",
+        short_display_name="ROUGE-1 (Chinese)",
+        description="ROUGE-1 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese "
+        "tokenizer that segments Chinese strings by character.",
+        lower_is_better=False,
+        group="cleva_summarization_metrics",
+    ),
+    "chinese_rouge_2": MetricMetadata(
+        name="chinese_rouge_2",
+        display_name="Chinese ROUGE-2 score",
+        short_display_name="ROUGE-2 (Chinese)",
+        description="ROUGE-2 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese "
+        "tokenizer that segments Chinese strings by character.",
+        lower_is_better=False,
+        group="cleva_summarization_metrics",
+    ),
+    "cleva_math_result_match": MetricMetadata(
+        name="cleva_math_result_match",
+        display_name="CLEVA Math Exact Match",
+        short_display_name="EM (Math)",
+        description="Exact match that cares only the last math expression (numbers and fractions) in the "
+        "model's prediction.",
+        lower_is_better=False,
+        group="cleva_mathematical_reasoning_metrics",
+    ),
+    "absolute_value_difference": MetricMetadata(
+        name="absolute_value_difference",
+        display_name="Absolute difference",
+        short_display_name="Diff.",
+        description="Average absolute difference between the model output (converted to a number) and the "
+        "correct reference.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "wer_score": MetricMetadata(
+        name="wer_score",
+        display_name="Word Error Rate",
+        short_display_name="WER",
+        description="Word error rate between model predictions and ground truth answers for ASR tasks.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "mer_score": MetricMetadata(
+        name="mer_score",
+        display_name="Match Error Rate",
+        short_display_name="MER",
+        description="Word match error rate between model predictions and ground truth answers.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "wip_score": MetricMetadata(
+        name="wip_score",
+        display_name="Word Information Preservation",
+        short_display_name="WIP",
+        description="Word information preservation (WIP) for evaluating the preserved information of ASR.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "cer_score": MetricMetadata(
+        name="cer_score",
+        display_name="Character Error Rate",
+        short_display_name="CER",
+        description="Character error rate (CER) for evaluating the accuracy of ASR.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "chinese_wer_score": MetricMetadata(
+        name="chinese_wer_score",
+        display_name="Chinese Word Error Rate",
+        short_display_name="Chinese WER",
+        description="Chinese word error rate between model predictions and ground truth answers for ASR tasks.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "chinese_mer_score": MetricMetadata(
+        name="chinese_mer_score",
+        display_name="Chinese Match Error Rate",
+        short_display_name="Chinese MER",
+        description="Chinese word match error rate between model predictions and ground truth answers.",
+        lower_is_better=True,
+        group=None,
+    ),
+    "chinese_wip_score": MetricMetadata(
+        name="chinese_wip_score",
+        display_name="Chinese Word Information Preservation",
+        short_display_name="Chinese WIP",
+        description="Chinese word information preservation (WIP) for evaluating the preserved information of " "ASR.",
+        lower_is_better=False,
+        group=None,
+    ),
+    "chinese_cer_score": MetricMetadata(
+        name="chinese_cer_score",
+        display_name="Chinese Character Error Rate",
+        short_display_name="Chinese CER",
+        description="Chinese character error rate (CER) for evaluating the accuracy of Chiese ASR.",
+        lower_is_better=True,
+        group=None,
+    ),
+}
+
+
+def get_reference_metrics_metadata(names: List[str]) -> List[MetricMetadata]:
+    metadata_list: List[MetricMetadata] = []
+    for name in names:
+        metadata = _METRIC_METADATA_MAPPING.get(name)
+        if metadata:
+            metadata_list.append(metadata)
+    return metadata_list
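
get_reference_metrics_metadata is a plain lookup into _METRIC_METADATA_MAPPING: names found in the mapping come back in the order requested, and unknown names are silently dropped rather than raising. A usage sketch, assuming the module imports resolve in a HELM environment (the unknown name below is made up):

from helm.benchmark.metrics.evaluate_reference_metrics import get_reference_metrics_metadata

# "no_such_metric" is a made-up name: it is skipped without an error.
metadata = get_reference_metrics_metadata(["exact_match", "no_such_metric", "rouge_l"])
print([m.name for m in metadata])  # prints ['exact_match', 'rouge_l']
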
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py

@@ -3,7 +3,7 @@ from typing import List, Optional

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -101,3 +101,15 @@ class GPQAChainOfThoughtMetric(Metric):
         # Compare extracted answer with the correct answer and compute the score
         score = 1 if extracted_answer == correct_answer else 0
         return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="chain_of_thought_correctness",
+                display_name="COT correct",
+                short_display_name="COT correct",
+                description="Fraction of correct answers after chain of thought",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py

@@ -2,7 +2,7 @@ from typing import Any, Dict, List

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -34,3 +34,15 @@ class HelpdeskCallSummarizationMetric(Metric):
         return [
             Stat(MetricName("call_summarization_score")).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="call_summarization_score",
+                display_name="Score",
+                short_display_name="Score",
+                description="Score",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+        ]
helm/benchmark/metrics/ifeval_metrics.py

@@ -3,7 +3,7 @@ from typing import List
 from helm.common.hierarchical_logger import hwarn
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -53,3 +53,15 @@ class IFEvalMetric(Metric):
                 is_following_list.append(0)

         return [Stat(MetricName("ifeval_strict_accuracy")).add(sum(is_following_list) / len(is_following_list))]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="ifeval_strict_accuracy",
+                display_name="IFEval strict accuracy",
+                short_display_name="IFEval Strict Acc",
+                description="Fraction of instructions in the instance that are correctly followed.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
helm/benchmark/metrics/image_generation/clip_score_metrics.py

@@ -10,7 +10,7 @@ from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
-from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
 from helm.common.images_utils import is_blacked_out_image
 from helm.common.multimodal_request_utils import gather_generated_image_locations

@@ -55,7 +55,18 @@ class CLIPScoreMetric(Metric):
         # Truncate the prompt using the CLIP tokenizer before feeding into the CLIP model.
         # Otherwise, the library will throw an error.
         model = DEFAULT_CLIP_SCORE_MODEL
-        prompt = WindowServiceFactory.get_window_service(model, metric_service).truncate_from_right(prompt)
+
+        # The max length is 77, but we also need to account for <|startoftext|> and <|endoftext|>.
+        # This max length is hardcoded for DEFAULT_CLIP_SCORE_MODEL i.e. openai/clip-vit-large-patch14
+        max_sequence_length = 77 - 2
+        prompt = CLIPWindowService(
+            service=metric_service,
+            tokenizer_name=DEFAULT_CLIP_SCORE_MODEL,
+            max_sequence_length=max_sequence_length,
+            max_request_length=max_sequence_length,
+            end_of_text_token="",
+            prefix_token="",
+        ).truncate_from_right(prompt)

         scores: List[float] = []
         image_locations: List[str] = gather_generated_image_locations(request_result)
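
The CLIPScore change swaps the generic window-service lookup for a directly constructed CLIPWindowService capped at 75 content tokens, so that the prompt plus CLIP's <|startoftext|> and <|endoftext|> markers fits the model's 77-token context. Outside of HELM the same idea can be sketched with the Hugging Face tokenizer for that checkpoint; this is an independent illustration of the truncation, not the code path HELM uses, and the example prompt is made up:

from transformers import AutoTokenizer

# Same checkpoint that the hunk's comment identifies as DEFAULT_CLIP_SCORE_MODEL.
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
MAX_CONTENT_TOKENS = 77 - 2  # reserve room for <|startoftext|> and <|endoftext|>


def truncate_for_clip(prompt: str) -> str:
    # Tokenize without special tokens, keep the first 75 ids, and decode back to text.
    token_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    if len(token_ids) <= MAX_CONTENT_TOKENS:
        return prompt
    return tokenizer.decode(token_ids[:MAX_CONTENT_TOKENS])


long_prompt = "a watercolor painting of " + "a very ornate clockwork city " * 40
print(len(long_prompt), "->", len(truncate_for_clip(long_prompt)))  # character lengths before / after
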
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py

@@ -58,6 +58,6 @@ def compute_fractal_dimension(image_path: str) -> float:
     except ModuleNotFoundError as e:
         handle_module_not_found_error(e, ["heim"])

-    image = cv2.imread(image_path, 0) / 255.0  # type: ignore
+    image: np.ndarray = cv2.imread(image_path, 0) / 255.0  # type: ignore
     assert image.min() >= 0 and image.max() <= 1
     return fractal_dimension(image)
helm/benchmark/metrics/instruction_following_critique_metrics.py

@@ -3,7 +3,7 @@ from typing import Dict, List

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -189,3 +189,43 @@ class InstructionFollowingCritiqueMetric(Metric):

             stats[answer_name].add(answer_value)
         return list(stats.values())
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="Helpfulness",
+                display_name="Helpfulness",
+                description="Whether the model appears to do what it is instructed to.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Understandability",
+                display_name="Understandability",
+                description="Whether the answer is easy to comprehend.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Completeness",
+                display_name="Completeness",
+                description="Whether the response provides a comprehensive answer to the question or request.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Conciseness",
+                display_name="Conciseness",
+                description="Whether the response is concise and to the point, without unnecessary repetition or "
+                "elaboration.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+            MetricMetadata(
+                name="Harmlessness",
+                display_name="Harmlessness",
+                description="Whether the response avoids using toxic language or helping the user with harmful goals.",
+                lower_is_better=False,
+                group="instruction_following_metrics",
+            ),
+        ]