crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/model_deployment_registry.py CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
 
 
@@ -182,7 +180,7 @@ def get_default_model_deployment_for_model(
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
     if deployment.deprecated and ignore_deprecated:
         if warn_arg_deprecated:
-            hlog(f"WARNING: Model deployment {model_name} is deprecated")
+            hwarn(f"Model deployment {model_name} is deprecated")
         return None
     return deployment.name
 
@@ -193,7 +191,7 @@ def get_default_model_deployment_for_model(
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
            hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
 
        # Additionally, if there is a non-deprecated deployment, use it.
@@ -210,7 +208,7 @@ def get_default_model_deployment_for_model(
         else:
             chosen_deployment = available_deployments[0]
             if warn_arg_deprecated:
-                hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+                hwarn(f"All model deployments for model {model_name} are deprecated.")
         if warn_arg_deprecated:
             hlog(
                 f"Choosing {chosen_deployment.name} (the first one) as "
helm/benchmark/presentation/contamination.py CHANGED
@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml
 
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema
 
@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")
 
 
 def read_contamination():
helm/benchmark/presentation/create_plots.py CHANGED
@@ -11,7 +11,7 @@ import numpy as np
 from scipy.stats import pearsonr
 
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 
@@ -600,17 +600,7 @@ class Plotter:
         self.create_constrast_set_plots()
 
 
-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +611,36 @@ def main():
     plotter.create_all_plots()
 
 
+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    args = parser.parse_args()
+    setup_default_logging()
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
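Note: with argument parsing split out of main(), plot generation can in principle be driven from Python by passing create_plots a namespace carrying the same attributes the parser would produce. A sketch under that assumption (the suite name is a placeholder, and summarize must have been run for the same suite first):

    from argparse import Namespace

    from helm.benchmark.presentation.create_plots import create_plots
    from helm.common.hierarchical_logger import setup_default_logging

    setup_default_logging()  # mirrors what the new main() does before delegating
    args = Namespace(output_path="benchmark_output", suite="my-suite", plot_format="png")
    create_plots(args)  # reads benchmark_output/runs/my-suite/groups, as in the function body above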
helm/benchmark/presentation/run_display.py CHANGED
@@ -59,6 +59,9 @@ class DisplayPrediction:
 
     annotations: Optional[Dict[str, Any]]
 
+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+
 
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )
 
+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if os.path.exists(image_location)
         ]
 
+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
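Note: DisplayPrediction now carries an optional thinking_text field, populated from the completion's thinking block when a reasoning model returns one. A hedged sketch of how a consumer might read it from the per-run display JSON (the display_predictions.json file name and record shape are assumptions based on the writer above, not shown in this diff):

    import json

    # Assumed output location: <benchmark_output>/runs/<suite>/<run>/display_predictions.json
    with open("display_predictions.json") as f:
        predictions = json.load(f)

    for prediction in predictions:
        thinking = prediction.get("thinking_text")
        if thinking:
            print(prediction["instance_id"], thinking[:100])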
helm/benchmark/presentation/schema.py CHANGED
@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn
 
 
 # TODO: change to `helm.benchmark.config`
@@ -281,5 +282,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
helm/benchmark/presentation/summarize.py CHANGED
@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -102,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
         # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
         # as the main metric but multiple_choice_separate_original only generates exact_match
         if matcher.name == "quasi_exact_match":
-            hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+            hwarn("No quasi_exact_match metric found, looking for exact_match instead")
             matcher = replace(matcher, name="exact_match")
             matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -406,8 +406,8 @@ class Summarizer:
             included = False
             for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
                 if run_group_name not in self.schema.name_to_run_group:
-                    hlog(
-                        f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                    hwarn(
+                        f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                         f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
@@ -440,14 +440,14 @@ class Summarizer:
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -544,8 +544,8 @@ class Summarizer:
 
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
 
@@ -738,8 +738,8 @@ class Summarizer:
         if stat is None:
             # Print out near misses to provide a more informative warning
             near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-            hlog(
-                f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+            hwarn(
+                f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                 f"{len(near_misses)} near misses matching just the name"
             )
             if len(near_misses) > 0:
@@ -810,7 +810,7 @@ class Summarizer:
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])
 
         header: List[HeaderCell] = []
@@ -831,7 +831,7 @@ class Summarizer:
             matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-                hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -959,8 +959,8 @@ class Summarizer:
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -1232,10 +1232,57 @@ class Summarizer:
 
 
 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
@@ -1253,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1285,47 +1341,8 @@ def main():
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
     args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
-    summarizer = summarizer_cls(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
-    )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    setup_default_logging()
+    summarize(args)
 
 
 if __name__ == "__main__":
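Note: the same parse-then-delegate split used in create_plots.py above (and in run.py below) appears here: main() only builds the parser, then calls setup_default_logging() and summarize(args), which performs the --suite vs. --release/--suites validation. A sketch of a programmatic call, assuming a namespace with the fields summarize() reads (all values below are placeholders):

    from argparse import Namespace

    from helm.benchmark.presentation.summarize import summarize

    args = Namespace(
        suite="my-suite",                # exactly one of suite or (release + suites) may be set
        release=None,
        suites=None,
        schema_path=None,                # falls back to get_default_schema_path()
        output_path="benchmark_output",
        local_path="prod_env",           # passed to register_configs_from_directory
        summarizer_class_name=None,      # None selects the default Summarizer
        debug=False,
        num_threads=8,
        allow_unknown_models=True,
        skip_completed_run_display_json=False,
    )
    summarize(args)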
helm/benchmark/reeval_run.py CHANGED
@@ -6,7 +6,7 @@ from typing import List
 from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.hierarchical_logger import hlog, htrack, hwarn
 from helm.common.authentication import Authentication
 from helm.proxy.services.remote_service import create_authentication, add_service_args
 
@@ -191,9 +191,8 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )
 
     hlog("Done.")
helm/benchmark/reeval_runner.py CHANGED
@@ -12,7 +12,7 @@ from datasets import load_dataset
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
 from helm.common.cache import cache_stats
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -193,7 +193,7 @@ class REEvalRunner(Runner):
             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
         except ValueError:
-            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            hwarn(f"no available difficulty for {split_name}, skipping")
             return
 
         unasked_request_states: List[RequestState] = []
@@ -320,7 +320,7 @@ class REEvalRunner(Runner):
         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
         for metric_name, count in metric_counts.items():
             if count > 1:
-                hlog(f"WARNING: duplicate metric name {metric_name}")
+                hwarn(f"duplicate metric name {metric_name}")
 
         # Print out the number of stats
         hlog(f"Generated {len(stats)} stats.")
helm/benchmark/run.py CHANGED
@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
@@ -200,76 +200,9 @@ def validate_args(args):
 
 
 @htrack(None)
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=[],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--skip-completed-runs",
-        action="store_true",
-        help="Skip RunSpecs that have completed i.e. output files exists.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    parser.add_argument(
-        "--run-specs",
-        nargs="*",
-        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
-        "Specifies run entries to run.",
-        default=[],
-    )
-    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
-    parser.add_argument(
-        "--enable-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
-        "Format: namespace/model_name[@revision]",
-    )
-    parser.add_argument(
-        "--enable-local-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
-    )
-    parser.add_argument(
-        "--runner-class-name",
-        type=str,
-        default=None,
-        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
+def helm_run(args):
 
+    validate_args(args)
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
 
@@ -358,13 +291,85 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
        )
 
     hlog("Done.")
 
 
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = argparse.ArgumentParser()
+    add_service_args(parser)
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
+    parser.add_argument(
+        "--enable-local-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+    )
+    parser.add_argument(
+        "--runner-class-name",
+        type=str,
+        default=None,
+        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+    )
+    add_run_args(parser)
+    args = parser.parse_args()
+    setup_default_logging()
+    return helm_run(args)
+
+
 if __name__ == "__main__":
     main()