crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.

Note: this release of crfm-helm has been flagged as potentially problematic.

Files changed (268):
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/model_deployment_registry.py

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
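The change that recurs throughout this release is mechanical: hand-written hlog(f"WARNING: ...") calls are replaced by a new hwarn helper from helm.common.hierarchical_logger. The helper's definition is not part of this diff; judging by how every call site drops the "WARNING: " prefix, it presumably behaves like the following hypothetical sketch:

from helm.common.hierarchical_logger import hlog


def hwarn(message: str) -> None:
    # Hypothetical reconstruction: hwarn's definition is not shown in this
    # diff. Call sites suggest it prepends the "WARNING: " prefix that
    # callers previously spelled out by hand, then delegates to hlog.
    hlog(f"WARNING: {message}")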
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
 
 
@@ -159,12 +157,11 @@ def get_default_model_deployment_for_model(
     Example: "meta/llama-7b" => "together/llama-7b"
 
     The process to find a model deployment name is as follows:
-    1. If there is a model deployment with the same name as the model arg, use it.
-    2. If there is at least one deployment for the model, use the first one that is available.
-    3. If there are no deployments for the model, returns None.
+    1. If there is at least one deployment for the model, use the last one that is available.
+    2. If there are no deployments for the model, returns None.
 
     This function will also try to find a model deployment name that is not deprecated.
-    If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated).
+    If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
     If ignore_deprecated is True, this function will return None if the model deployment is deprecated.
 
     If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same
@@ -177,23 +174,14 @@ def get_default_model_deployment_for_model(
         ignore_deprecated: Whether to return None if the model deployment is deprecated.
     """
 
-    # If there is a model deployment with the same name as the model arg, use it.
-    if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
-        deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
-        if deployment.deprecated and ignore_deprecated:
-            if warn_arg_deprecated:
-                hlog(f"WARNING: Model deployment {model_name} is deprecated")
-            return None
-        return deployment.name
-
-    # If there is at least one deployment for the model, use the first one that is available.
+    # If there is at least one deployment for the model, use the last one that is available.
     available_deployments: List[ModelDeployment] = [
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
    ]
    if len(available_deployments) > 0:
        available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
        if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
             hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
 
     # Additionally, if there is a non-deprecated deployment, use it.
@@ -201,19 +189,21 @@ def get_default_model_deployment_for_model(
         deployment for deployment in available_deployments if not deployment.deprecated
     ]
     if len(non_deprecated_deployments) > 0:
-        chosen_deployment = non_deprecated_deployments[0]
+        chosen_deployment = non_deprecated_deployments[-1]
     # There are no non-deprecated deployments, so there are two options:
     # 1. If we can return an empty string, return it. (no model deployment is available)
-    # 2. If we can't return an empty string, return the first deployment (even if it's deprecated).
+    # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
     elif ignore_deprecated:
         return None
-    else:
-        chosen_deployment = available_deployments[0]
+    elif len(available_deployments) > 0:
+        chosen_deployment = available_deployments[-1]
         if warn_arg_deprecated:
-            hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+            hwarn(f"All model deployments for model {model_name} are deprecated.")
+    else:
+        return None
     if warn_arg_deprecated:
         hlog(
-            f"Choosing {chosen_deployment.name} (the first one) as "
+            f"Choosing {chosen_deployment.name} (the last one) as "
             f"the default model deployment for model {model_name}"
         )
         hlog("If you want to use a different model deployment, please specify it explicitly.")
helm/benchmark/presentation/contamination.py

@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml
 
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema
 
@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")
 
 
 def read_contamination():
helm/benchmark/presentation/create_plots.py

@@ -1,4 +1,7 @@
-# mypy: check_untyped_defs = False
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 import argparse
 from collections import defaultdict
 from dataclasses import dataclass
@@ -11,7 +14,7 @@ import numpy as np
 from scipy.stats import pearsonr
 
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 
@@ -600,17 +603,7 @@ class Plotter:
         self.create_constrast_set_plots()
 
 
-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +614,42 @@ def main():
     plotter.create_all_plots()
 
 
+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
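main() is now a thin CLI wrapper: it parses the arguments (including the new --log-config option), calls setup_default_logging, and delegates the actual work to create_plots(args). A side effect is that plotting can presumably be driven programmatically as well; a sketch under that assumption, with Namespace fields mirroring the parser above (the suite name is hypothetical, and calling create_plots directly is not a documented API):

from argparse import Namespace

from helm.benchmark.presentation.create_plots import create_plots

# Field names mirror the CLI flags defined in main() above.
args = Namespace(
    output_path="benchmark_output",  # same default as the -o flag
    suite="my-suite",                # hypothetical suite name
    plot_format="png",
)
create_plots(args)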
helm/benchmark/presentation/run_display.py

@@ -59,6 +59,9 @@ class DisplayPrediction:
 
     annotations: Optional[Dict[str, Any]]
 
+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+
 
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )
 
+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if os.path.exists(image_location)
         ]
 
+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
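Taken together, the run_display.py hunks thread an optional thinking_text field (the reasoning trace emitted by "thinking" models) from the first completion of each request result into the per-instance display JSON. A sketch of how a consumer might read it back, assuming the usual display_predictions.json layout of a HELM run directory (the path below is illustrative):

import json

# Illustrative path; write_run_display_json emits one display JSON per run.
path = "benchmark_output/runs/my-suite/my-run/display_predictions.json"
with open(path) as f:
    for prediction in json.load(f):
        # thinking_text is None for models that emit no reasoning trace.
        if prediction.get("thinking_text"):
            print(prediction["instance_id"], prediction["thinking_text"][:80])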
helm/benchmark/presentation/schema.py

@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn
 
 
 # TODO: change to `helm.benchmark.config`
@@ -204,6 +205,11 @@ class RunGroup(Field):
     # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
     adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])
 
+    # Optional short description of the run group.
+    # This description is used in some space-constrained places in frontend tables.
+    # If unset, the description field will be used instead.
+    short_description: Optional[str] = None
+
 
 @dataclass
 class Schema:
@@ -281,5 +287,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
helm/benchmark/presentation/summarize.py

@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -102,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-        hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -294,7 +294,6 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
 
 
 class AggregationStrategy:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     WIN_RATE = "win_rate"
     MEAN = "mean"
 
@@ -406,8 +405,8 @@ class Summarizer:
         included = False
         for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
             if run_group_name not in self.schema.name_to_run_group:
-                hlog(
-                    f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                hwarn(
+                    f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                     f"but undefined in {self.schema_path}, skipping"
                 )
                 continue
@@ -440,14 +439,14 @@ class Summarizer:
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -544,8 +543,8 @@ class Summarizer:
 
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
 
@@ -738,8 +737,8 @@ class Summarizer:
         if stat is None:
             # Print out near misses to provide a more informative warning
             near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-            hlog(
-                f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+            hwarn(
+                f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                 f"{len(near_misses)} near misses matching just the name"
             )
             if len(near_misses) > 0:
@@ -810,7 +809,7 @@ class Summarizer:
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])
 
         header: List[HeaderCell] = []
@@ -831,7 +830,7 @@ class Summarizer:
                 matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-                hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -839,7 +838,8 @@ class Summarizer:
             }
 
             header_name = header_field.get_short_display_name()
-            description = (run_group.description + "\n\n" if run_group.description is not None else "") + (
+            run_group_short_description = run_group.short_description or run_group.description or ""
+            description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
                 (header_field.display_name if header_field.display_name else header_field.name)
                 + ": "
                 + (header_field.description if header_field.description is not None else "")
@@ -959,8 +959,8 @@ class Summarizer:
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -1232,10 +1232,57 @@
 
 
 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
1253
1300
  help="Experimental: Name of the release this summarization should go under.",
1254
1301
  )
1255
1302
  parser.add_argument(
1256
- "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
1303
+ "--suites",
1304
+ type=str,
1305
+ nargs="+",
1306
+ help="Experimental: List of suites to summarize for this this release.",
1307
+ )
1308
+ parser.add_argument(
1309
+ "-n",
1310
+ "--num-threads",
1311
+ type=int,
1312
+ help="Max number of threads used to summarize",
1313
+ default=8,
1257
1314
  )
1258
- parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
1259
1315
  parser.add_argument(
1260
1316
  "--debug",
1261
1317
  action="store_true",
@@ -1284,48 +1340,15 @@ def main():
         default=None,
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
-    args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
-    summarizer = summarizer_cls(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    summarize(args)
 
 
 if __name__ == "__main__":
helm/benchmark/presentation/test_create_plots.py

@@ -1,4 +1,7 @@
-# mypy: check_untyped_defs = False
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 from helm.common.general import asdict_without_nones
 from helm.benchmark.presentation.table import Table, Cell, HeaderCell
 from helm.benchmark.presentation.create_plots import parse_table
helm/benchmark/reeval_run.py

@@ -6,7 +6,7 @@ from typing import List
 from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.hierarchical_logger import hlog, htrack, hwarn
 from helm.common.authentication import Authentication
 from helm.proxy.services.remote_service import create_authentication, add_service_args
 
@@ -191,9 +191,8 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )
 
     hlog("Done.")
helm/benchmark/reeval_runner.py

@@ -12,7 +12,7 @@ from datasets import load_dataset
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
 from helm.common.cache import cache_stats
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -193,7 +193,7 @@ class REEvalRunner(Runner):
             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
         except ValueError:
-            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            hwarn(f"no available difficulty for {split_name}, skipping")
             return
 
         unasked_request_states: List[RequestState] = []
@@ -320,7 +320,7 @@ class REEvalRunner(Runner):
         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
         for metric_name, count in metric_counts.items():
             if count > 1:
-                hlog(f"WARNING: duplicate metric name {metric_name}")
+                hwarn(f"duplicate metric name {metric_name}")
 
         # Print out the number of stats
         hlog(f"Generated {len(stats)} stats.")