crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (311) hide show
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
11
11
 
12
12
  from helm.benchmark.adaptation.request_state import RequestState
13
13
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
14
- from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
14
+ from helm.benchmark.metrics.metric import MetricInterface, MetricMetadata, MetricResult, PerInstanceStats, add_context
15
15
  from helm.benchmark.metrics.metric_name import MetricContext, MetricName
16
16
  from helm.benchmark.metrics.metric_service import MetricService
17
17
  from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -97,3 +97,15 @@ class LanguageModelingMetric(MetricInterface):
97
97
  derived_stats: List[Stat] = []
98
98
  derived_stats.extend(compute_perplexity_metrics(stats_dict))
99
99
  return derived_stats
100
+
101
+ def get_metadata(self) -> List[MetricMetadata]:
102
+ return [
103
+ MetricMetadata(
104
+ name="num_instances",
105
+ display_name="# eval",
106
+ short_display_name=None,
107
+ description="Number of evaluation instances.",
108
+ lower_is_better=None,
109
+ group="general_information",
110
+ ),
111
+ ]
@@ -2,7 +2,7 @@ from typing import List
2
2
 
3
3
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
4
  from helm.benchmark.adaptation.request_state import RequestState
5
- from helm.benchmark.metrics.metric import Metric
5
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
6
6
  from helm.benchmark.metrics.metric_name import MetricName
7
7
  from helm.benchmark.metrics.metric_service import MetricService
8
8
  from helm.benchmark.metrics.statistic import Stat
@@ -21,3 +21,15 @@ class LiveQAScoreMetric(Metric):
21
21
  assert request_state.annotations
22
22
  score = request_state.annotations["live_qa"]["score"]
23
23
  return [Stat(MetricName("live_qa_score")).add(score)]
24
+
25
+ def get_metadata(self) -> List[MetricMetadata]:
26
+ return [
27
+ MetricMetadata(
28
+ name="live_qa_score",
29
+ display_name="Judge Score",
30
+ short_display_name=None,
31
+ description="LLM-as-judge score",
32
+ lower_is_better=False,
33
+ group=None,
34
+ ),
35
+ ]
@@ -3,7 +3,7 @@ from typing import Any, Dict, List
3
3
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
4
  from helm.benchmark.adaptation.request_state import RequestState
5
5
  from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
6
- from helm.benchmark.metrics.metric import Metric
6
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
7
7
  from helm.benchmark.metrics.metric_name import MetricName
8
8
  from helm.benchmark.metrics.metric_service import MetricService
9
9
  from helm.benchmark.metrics.statistic import Stat
@@ -44,3 +44,15 @@ class LLMJuryMetric(Metric):
44
44
  return [
45
45
  Stat(MetricName(self.metric_name)).add(score),
46
46
  ]
47
+
48
+ def get_metadata(self) -> List[MetricMetadata]:
49
+ return [
50
+ MetricMetadata(
51
+ name=self.metric_name,
52
+ display_name="Jury Score",
53
+ short_display_name="Jury Score",
54
+ description="Measures the average score assigned by an LLM-based jury evaluating task performance.",
55
+ lower_is_better=False,
56
+ group=None,
57
+ ),
58
+ ]
@@ -0,0 +1,12 @@
1
+ from typing import List
2
+
3
+ from helm.benchmark.metrics.metric import MetricSpec
4
+
5
+
6
+ def get_semantic_similarity_metric_specs(similarity_fn_name: str = "cosine") -> List[MetricSpec]:
7
+ return [
8
+ MetricSpec(
9
+ class_name="helm.benchmark.metrics.lmkt_metrics.SemanticSimilarityMetric",
10
+ args={"similarity_fn_name": similarity_fn_name},
11
+ ),
12
+ ]
@@ -0,0 +1,47 @@
1
+ from typing import List
2
+
3
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
+ from helm.benchmark.adaptation.request_state import RequestState
5
+ from helm.benchmark.metrics.metric import Metric
6
+ from helm.benchmark.metrics.metric_name import MetricName
7
+ from helm.benchmark.metrics.metric_service import MetricService
8
+ from helm.benchmark.metrics.statistic import Stat
9
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG
10
+ from sentence_transformers import SentenceTransformer
11
+
12
+
13
+ class SemanticSimilarityMetric(Metric):
14
+ """Score metrics for LMKT semantic similarity measurement."""
15
+
16
+ def __init__(self, similarity_fn_name: str = "cosine"):
17
+ """
18
+ Initialize the SemanticSimilarityMetric with a SentenceTransformer model.
19
+ :param similarity_fn_name: The name of the similarity function to use.
20
+ Available options are "dot", "cosine", "manhattan" and "euclidean".
21
+ """
22
+ super().__init__()
23
+
24
+ self.model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", similarity_fn_name=similarity_fn_name)
25
+
26
+ def evaluate_generation(
27
+ self,
28
+ adapter_spec: AdapterSpec,
29
+ request_state: RequestState,
30
+ metric_service: MetricService,
31
+ eval_cache_path: str,
32
+ ) -> List[Stat]:
33
+
34
+ assert request_state.result
35
+
36
+ completions = [c.text for c in request_state.result.completions]
37
+ completion_embeddings = self.model.encode(completions)
38
+
39
+ references = [r.output.text for r in request_state.instance.references if CORRECT_TAG in r.tags]
40
+ reference_embeddings = self.model.encode(references)
41
+
42
+ similarities = self.model.similarity(completion_embeddings, reference_embeddings)
43
+ avg_similarity = similarities.mean().item()
44
+
45
+ return [
46
+ Stat(MetricName("semantic_similarity")).add(avg_similarity),
47
+ ]
@@ -4,7 +4,7 @@ from datetime import datetime
4
4
  from typing import List, Dict, Any
5
5
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
6
6
  from helm.benchmark.adaptation.request_state import RequestState
7
- from helm.benchmark.metrics.metric import Metric
7
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
8
8
  from helm.benchmark.metrics.metric_name import MetricName
9
9
  from helm.benchmark.metrics.metric_service import MetricService
10
10
  from helm.benchmark.metrics.statistic import Stat
@@ -122,3 +122,16 @@ class MedCalcBenchMetric(Metric):
122
122
  return [
123
123
  Stat(MetricName("medcalc_bench_accuracy")).add(exact_match),
124
124
  ]
125
+
126
+ def get_metadata(self) -> List[MetricMetadata]:
127
+ return [
128
+ MetricMetadata(
129
+ name="medcalc_bench_accuracy",
130
+ display_name="MedCalc Accuracy",
131
+ short_display_name="MedCalc Accuracy",
132
+ description="Comparison based on category. Exact match for categories risk, severity and diagnosis. "
133
+ "Check if within range for the other categories.",
134
+ lower_is_better=False,
135
+ group=None,
136
+ ),
137
+ ]
@@ -1,12 +1,13 @@
1
1
  from typing import List
2
+ import re
3
+
2
4
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
3
5
  from helm.benchmark.adaptation.request_state import RequestState
4
- from helm.benchmark.metrics.metric import Metric
6
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
5
7
  from helm.benchmark.metrics.metric_name import MetricName
6
8
  from helm.benchmark.metrics.metric_service import MetricService
7
9
  from helm.benchmark.metrics.statistic import Stat
8
10
  from helm.common.hierarchical_logger import hlog
9
- import re
10
11
  from helm.benchmark.scenarios.scenario import CORRECT_TAG
11
12
 
12
13
 
@@ -99,3 +100,25 @@ class MedecMetric(Metric):
99
100
  Stat(MetricName("medec_error_flag_accuracy")).add(flag_accuracy),
100
101
  Stat(MetricName("medec_error_sentence_accuracy")).add(sentence_accuracy),
101
102
  ]
103
+
104
+ def get_metadata(self) -> List[MetricMetadata]:
105
+ return [
106
+ MetricMetadata(
107
+ name="medec_error_flag_accuracy",
108
+ display_name="Medical Error Flag Accuracy",
109
+ short_display_name="MedecFlagAcc",
110
+ description="Measures how accurately the model identifies whether a clinical note contains an error "
111
+ "(binary classification of correct/incorrect).",
112
+ lower_is_better=False,
113
+ group=None,
114
+ ),
115
+ MetricMetadata(
116
+ name="medec_error_sentence_accuracy",
117
+ display_name="Medical Error Sentence Accuracy",
118
+ short_display_name="MedecSentenceAcc",
119
+ description="Measures how accurately the model identifies the specific erroneous sentence within a "
120
+ "clinical note.",
121
+ lower_is_better=False,
122
+ group=None,
123
+ ),
124
+ ]
@@ -34,7 +34,7 @@ class VietnameseToxicityMetric(Metric):
34
34
  def __repr__(self):
35
35
  return "ToxicityMetric()"
36
36
 
37
- def _get_toxicity_score(self, predictions: Dict) -> List[float]:
37
+ def _get_toxicity_score(self, predictions: List[Dict]) -> List[float]:
38
38
  """Extracts toxicity scores from the predictions.
39
39
 
40
40
  Args:
@@ -86,6 +86,28 @@ class Processor:
86
86
  return instance_stats
87
87
 
88
88
 
89
+ @dataclass(frozen=True)
90
+ class MetricMetadata:
91
+ name: str
92
+ """Internal name (usually no spaces, etc.)"""
93
+
94
+ display_name: Optional[str] = None
95
+ """What is displayed to the user"""
96
+
97
+ short_display_name: Optional[str] = None
98
+ """What is displayed to the user (e.g., in a table header)"""
99
+
100
+ description: Optional[str] = None
101
+ """Description of the metric"""
102
+
103
+ lower_is_better: Optional[bool] = None
104
+ """Whether a lower value for this metric corresponds to a better model
105
+ (e.g., False for accuracy, True for perplexity, None for num_trials)"""
106
+
107
+ group: Optional[str] = None
108
+ """Name of the default metric group for this metric"""
109
+
110
+
89
111
  class MetricInterface(ABC):
90
112
  """Interface for all Metrics."""
91
113
 
@@ -95,6 +117,9 @@ class MetricInterface(ABC):
95
117
  ) -> MetricResult:
96
118
  pass
97
119
 
120
+ def get_metadata(self) -> List[MetricMetadata]:
121
+ raise NotImplementedError()
122
+
98
123
 
99
124
  class Metric(MetricInterface, ABC):
100
125
  """
@@ -1,7 +1,7 @@
1
1
  from typing import List
2
2
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
3
3
  from helm.benchmark.adaptation.request_state import RequestState
4
- from helm.benchmark.metrics.metric import Metric
4
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
5
5
  from helm.benchmark.metrics.metric_name import MetricName
6
6
  from helm.benchmark.metrics.metric_service import MetricService
7
7
  from helm.benchmark.metrics.statistic import Stat
@@ -94,3 +94,34 @@ class MIMICIVBillingCodeMetric(Metric):
94
94
  Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
95
95
  Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
96
96
  ]
97
+
98
+ def get_metadata(self) -> List[MetricMetadata]:
99
+ return [
100
+ MetricMetadata(
101
+ name="mimiciv_billing_code_precision",
102
+ display_name="Precision for MIMIC Billing Codes",
103
+ short_display_name="MIMICBillingPre",
104
+ description="Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by "
105
+ "the model.",
106
+ lower_is_better=False,
107
+ group=None,
108
+ ),
109
+ MetricMetadata(
110
+ name="mimiciv_billing_code_recall",
111
+ display_name="Recall for MIMIC Billing Codes",
112
+ short_display_name="MIMICBillingRec",
113
+ description="Measures the proportion of correctly predicted ICD codes among all ICD codes present in "
114
+ "the gold standard.",
115
+ lower_is_better=False,
116
+ group=None,
117
+ ),
118
+ MetricMetadata(
119
+ name="mimiciv_billing_code_f1",
120
+ display_name="F1 Score for MIMIC Billing Codes",
121
+ short_display_name="MIMICBillingF1",
122
+ description="Measures the harmonic mean of precision and recall for ICD codes, providing a balanced "
123
+ "evaluation of the model's performance.",
124
+ lower_is_better=False,
125
+ group=None,
126
+ ),
127
+ ]
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
2
2
 
3
3
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
4
  from helm.benchmark.adaptation.request_state import RequestState
5
- from helm.benchmark.metrics.metric import Metric
5
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
6
6
  from helm.benchmark.metrics.metric_name import MetricName
7
7
  from helm.benchmark.metrics.metric_service import MetricService
8
8
  from helm.benchmark.metrics.statistic import Stat
@@ -30,3 +30,15 @@ class OmniMATHMetric(Metric):
30
30
  return [
31
31
  Stat(MetricName("omni_math_accuracy")).add(score),
32
32
  ]
33
+
34
def get_metadata(self) -> List[MetricMetadata]:
    """Describe the single ``omni_math_accuracy`` statistic.

    Returns:
        A one-element list holding the metadata entry, which is assigned to
        the "accuracy" group.
    """
    accuracy_metadata = MetricMetadata(
        name="omni_math_accuracy",
        display_name="Omni-MATH Accuracy",
        short_display_name="Acc",
        description="Accuracy of the AI output judged by GPT-4.",
        lower_is_better=False,
        group="accuracy",
    )
    return [accuracy_metadata]
@@ -8,7 +8,7 @@ from sacrebleu.metrics import CHRF
8
8
 
9
9
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
10
10
  from helm.benchmark.adaptation.request_state import RequestState
11
- from helm.benchmark.metrics.metric import Metric
11
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
12
12
  from helm.benchmark.metrics.metric_name import MetricName
13
13
  from helm.benchmark.metrics.metric_service import MetricService
14
14
  from helm.benchmark.metrics.statistic import Stat
@@ -73,6 +73,19 @@ class SEAHELMMachineTranslationMetric(Metric):
73
73
 
74
74
  return result
75
75
 
76
def get_metadata(self) -> List[MetricMetadata]:
    """Describe the ``chr_f_plus_plus`` statistic.

    Returns:
        A one-element list holding the ChrF++ metadata entry. No short display
        name is supplied and the entry is not assigned to a group.
    """
    # Markdown links embedded in the description.
    paper_link = "[(Popovic, 2015)](https://aclanthology.org/W15-3049/)"
    code_link = "[here](https://github.com/mjpost/sacrebleu)"
    chrf_metadata = MetricMetadata(
        name="chr_f_plus_plus",
        display_name="ChrF++",
        description=f"Character n-gram F-score with word n-gram order (ChrF++) {paper_link}. "
        f"Code can be found {code_link}.",
        lower_is_better=False,
        group=None,
    )
    return [chrf_metadata]
88
+
76
89
 
77
90
  class SEAHELMQAMetric(Metric):
78
91
  """SEAHELM QA Metrics
@@ -219,7 +219,7 @@ class SummaCConv(torch.nn.Module):
219
219
  imager_load_cache=True,
220
220
  agg="mean",
221
221
  norm_histo=False,
222
- **kwargs
222
+ **kwargs,
223
223
  ):
224
224
  # `bins` should be `even%d` or `percentiles`
225
225
  assert nli_labels in ["e", "c", "n", "ec", "en", "cn", "ecn"], "Unrecognized nli_labels argument %s" % (
@@ -240,7 +240,7 @@ class SummaCConv(torch.nn.Module):
240
240
 
241
241
  if "even" in bins:
242
242
  n_bins = int(bins.replace("even", ""))
243
- self.bins = list(np.arange(0, 1, 1 / n_bins)) + [1.0]
243
+ self.bins = np.arange(0, 1, 1 / n_bins).tolist() + [1.0]
244
244
  elif bins == "percentile":
245
245
  self.bins = [
246
246
  0.0,
@@ -405,7 +405,7 @@ class SummaCZS:
405
405
  use_con=True,
406
406
  imager_load_cache=True,
407
407
  device="cuda",
408
- **kwargs
408
+ **kwargs,
409
409
  ):
410
410
  assert op2 in ["min", "mean", "max"], "Unrecognized `op2`"
411
411
  assert op1 in ["max", "mean", "min"], "Unrecognized `op1`"
@@ -16,7 +16,7 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
16
16
  from helm.common.hierarchical_logger import hlog
17
17
  from helm.common.general import ensure_file_downloaded
18
18
  from helm.common.optional_dependencies import handle_module_not_found_error
19
- from helm.benchmark.metrics.metric import Metric, MetricResult
19
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata, MetricResult
20
20
  from helm.benchmark.metrics.metric_name import MetricName
21
21
  from helm.benchmark.metrics.metric_service import MetricService
22
22
  from helm.benchmark.metrics.statistic import Stat
@@ -240,6 +240,134 @@ class SummarizationMetric(Metric):
240
240
 
241
241
  return result
242
242
 
243
def get_metadata(self) -> List[MetricMetadata]:
    """Describe the statistics this summarization metric can report.

    The base list is always returned. Further entries are appended depending
    on instance configuration:

    * ``self.humaneval is not None``: the three HumanEval-* entries.
    * ``self.compute_faithfulness``: the ``summac`` entry.
    * ``self.compute_bertscore``: the three BERTScore-* entries.

    Returns:
        The metadata entries, base entries first, then the optional ones in
        the order listed above.
    """
    metadata: List[MetricMetadata] = [
        MetricMetadata(
            name="QAFactEval",
            display_name="QAFactEval",
            # Previously this duplicated the SummaC description verbatim.
            description="Faithfulness scores based on the QAFactEval method of [Fabbri et al. "
            "(2022)](https://aclanthology.org/2022.naacl-main.187/).",
            lower_is_better=False,
            group="summarization_metrics",
        ),
        MetricMetadata(
            name="summarization_coverage",
            display_name="Coverage",
            description="Extent to which the model-generated summaries are extractive fragments from the source "
            "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
            lower_is_better=None,
            group="summarization_metrics",
        ),
        MetricMetadata(
            name="summarization_density",
            display_name="Density",
            description="Extent to which the model-generated summaries are extractive summaries based on the "
            "source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
            lower_is_better=None,
            group="summarization_metrics",
        ),
        MetricMetadata(
            name="summarization_compression",
            display_name="Compression",
            description="Extent to which the model-generated summaries are compressed relative to the source "
            "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
            lower_is_better=None,
            group="summarization_metrics",
        ),
        # The ROUGE names all use underscores so they agree with one another
        # (the original mixed "rouge_1" with "rouge-2" / "rouge-l").
        MetricMetadata(
            name="rouge_1",
            display_name="ROUGE-1",
            short_display_name="ROUGE-1",
            description="ROUGE-1",
            lower_is_better=False,
            group="summarization_metrics",
        ),
        MetricMetadata(
            name="rouge_2",
            display_name="ROUGE-2",
            short_display_name="ROUGE-2",
            description="ROUGE-2",
            lower_is_better=False,
            group="summarization_metrics",
        ),
        MetricMetadata(
            name="rouge_l",
            display_name="ROUGE-L",
            short_display_name="ROUGE-L",
            description="ROUGE-L",
            lower_is_better=False,
            group="summarization_metrics",
        ),
    ]
    if self.humaneval is not None:
        metadata.extend(
            [
                MetricMetadata(
                    name="HumanEval-faithfulness",
                    display_name="HumanEval-faithfulness",
                    description="Human evaluation score for faithfulness.",
                    lower_is_better=False,
                    group="summarization_metrics",
                ),
                MetricMetadata(
                    name="HumanEval-relevance",
                    display_name="HumanEval-relevance",
                    description="Human evaluation score for relevance.",
                    lower_is_better=False,
                    group="summarization_metrics",
                ),
                MetricMetadata(
                    name="HumanEval-coherence",
                    display_name="HumanEval-coherence",
                    description="Human evaluation score for coherence.",
                    lower_is_better=False,
                    group="summarization_metrics",
                ),
            ]
        )
    if self.compute_faithfulness:
        metadata.append(
            MetricMetadata(
                name="summac",
                display_name="SummaC",
                description="Faithfulness scores based on the SummaC method of [Laban et al. "
                "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
                lower_is_better=False,
                group="summarization_metrics",
            )
        )
    if self.compute_bertscore:
        # NOTE(review): only BERTScore-F is placed in "summarization_metrics";
        # P and R keep group=None exactly as in the original - confirm this is intended.
        metadata.extend(
            [
                MetricMetadata(
                    name="BERTScore-P",
                    display_name="BERTScore (P)",
                    description="Average BERTScore precision [(Zhang et al., "
                    "2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference "
                    "summary.",
                    lower_is_better=False,
                    group=None,
                ),
                MetricMetadata(
                    name="BERTScore-R",
                    display_name="BERTScore (R)",
                    description="Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
                    "between model generation and reference summary.",
                    lower_is_better=False,
                    group=None,
                ),
                MetricMetadata(
                    name="BERTScore-F",
                    display_name="BERTScore (F1)",
                    description="Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
                    "between model generation and reference summary.",
                    lower_is_better=False,
                    group="summarization_metrics",
                ),
            ]
        )
    return metadata
370
+
243
371
 
244
372
  def _paired_bootstrap_test(treatment_list: list, control_list: list, nboot: int = 10000):
245
373
  """
@@ -6,7 +6,7 @@ from helm.common.hierarchical_logger import hlog, hwarn
6
6
  from helm.benchmark.adaptation.request_state import RequestState
7
7
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
8
  from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
9
- from helm.benchmark.metrics.metric import Metric
9
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
10
10
  from helm.benchmark.metrics.metric_name import MetricName
11
11
  from helm.benchmark.metrics.metric_service import MetricService
12
12
  from helm.benchmark.metrics.statistic import Stat
@@ -89,3 +89,33 @@ class ToxicityMetric(Metric):
89
89
  ]
90
90
 
91
91
  return stats
92
+
93
def get_metadata(self) -> List[MetricMetadata]:
    """Describe the three toxicity statistics.

    Returns:
        Entries for ``expected_max_toxicity``, ``max_toxicity_probability``
        and ``toxic_frac``, in that order. All are created with
        ``lower_is_better=True``; only ``toxic_frac`` is assigned to a group
        ("toxicity").
    """
    # Fragments shared by the descriptions below.
    classifier_note = "based on the PerspectiveAPI toxicity classifier"
    gehman_link = "[Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)"

    expected_max = MetricMetadata(
        name="expected_max_toxicity",
        display_name="Average max toxicity",
        short_display_name="Avg. max toxicity",
        description=f"Average max toxicity score across all completions ({classifier_note}; {gehman_link}).",
        lower_is_better=True,
        group=None,
    )
    max_probability = MetricMetadata(
        name="max_toxicity_probability",
        display_name="max toxicity",
        description="Fraction of instances where most toxic completion is above 0.5 toxicity "
        f"({classifier_note}; {gehman_link}).",
        lower_is_better=True,
        group=None,
    )
    toxic_fraction = MetricMetadata(
        name="toxic_frac",
        display_name="Toxic fraction",
        description=f"Fraction of model outputs that are toxic ({classifier_note}).",
        lower_is_better=True,
        group="toxicity",
    )
    return [expected_max, max_probability, toxic_fraction]
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
2
2
 
3
3
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
4
  from helm.benchmark.adaptation.request_state import RequestState
5
- from helm.benchmark.metrics.metric import Metric
5
+ from helm.benchmark.metrics.metric import Metric, MetricMetadata
6
6
  from helm.benchmark.metrics.metric_name import MetricName
7
7
  from helm.benchmark.metrics.metric_service import MetricService
8
8
  from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
32
32
  Stat(MetricName("wildbench_score")).add(score),
33
33
  Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
34
34
  ]
35
+
36
def get_metadata(self) -> List[MetricMetadata]:
    """Describe the raw and rescaled WildBench score statistics.

    Returns:
        Entries for ``wildbench_score`` and ``wildbench_score_rescaled``, in
        that order. Both use the same display names and the "accuracy" group;
        only the statistic name and description differ.
    """
    # (statistic name, description)
    variants = [
        ("wildbench_score", "Score of the AI output judged by GPT-4o."),
        (
            "wildbench_score_rescaled",
            "Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
        ),
    ]
    return [
        MetricMetadata(
            name=stat_name,
            display_name="WildBench Score",
            short_display_name="WB Score",
            description=summary,
            lower_is_better=False,
            group="accuracy",
        )
        for stat_name, summary in variants
    ]
@@ -157,12 +157,11 @@ def get_default_model_deployment_for_model(
157
157
  Example: "meta/llama-7b" => "together/llama-7b"
158
158
 
159
159
  The process to find a model deployment name is as follows:
160
- 1. If there is a model deployment with the same name as the model arg, use it.
161
- 2. If there is at least one deployment for the model, use the first one that is available.
162
- 3. If there are no deployments for the model, returns None.
160
+ 1. If there is at least one deployment for the model, use the last one that is available.
161
+ 2. If there are no deployments for the model, returns None.
163
162
 
164
163
  This function will also try to find a model deployment name that is not deprecated.
165
- If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated).
164
+ If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
166
165
  If ignore_deprecated is True, this function will return None if the model deployment is deprecated.
167
166
 
168
167
  If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same
@@ -175,16 +174,7 @@ def get_default_model_deployment_for_model(
175
174
  ignore_deprecated: Whether to return None if the model deployment is deprecated.
176
175
  """
177
176
 
178
- # If there is a model deployment with the same name as the model arg, use it.
179
- if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
180
- deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
181
- if deployment.deprecated and ignore_deprecated:
182
- if warn_arg_deprecated:
183
- hwarn(f"Model deployment {model_name} is deprecated")
184
- return None
185
- return deployment.name
186
-
187
- # If there is at least one deployment for the model, use the first one that is available.
177
+ # If there is at least one deployment for the model, use the last one that is available.
188
178
  available_deployments: List[ModelDeployment] = [
189
179
  deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
190
180
  ]
@@ -199,19 +189,21 @@ def get_default_model_deployment_for_model(
199
189
  deployment for deployment in available_deployments if not deployment.deprecated
200
190
  ]
201
191
  if len(non_deprecated_deployments) > 0:
202
- chosen_deployment = non_deprecated_deployments[0]
192
+ chosen_deployment = non_deprecated_deployments[-1]
203
193
  # There are no non-deprecated deployments, so there are two options:
204
194
  # 1. If we can return an empty string, return it. (no model deployment is available)
205
- # 2. If we can't return an empty string, return the first deployment (even if it's deprecated).
195
+ # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
206
196
  elif ignore_deprecated:
207
197
  return None
208
- else:
209
- chosen_deployment = available_deployments[0]
198
+ elif len(available_deployments) > 0:
199
+ chosen_deployment = available_deployments[-1]
210
200
  if warn_arg_deprecated:
211
201
  hwarn(f"All model deployments for model {model_name} are deprecated.")
202
+ else:
203
+ return None
212
204
  if warn_arg_deprecated:
213
205
  hlog(
214
- f"Choosing {chosen_deployment.name} (the first one) as "
206
+ f"Choosing {chosen_deployment.name} (the last one) as "
215
207
  f"the default model deployment for model {model_name}"
216
208
  )
217
209
  hlog("If you want to use a different model deployment, please specify it explicitly.")