crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (268) hide show
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/common/general.py CHANGED
@@ -42,6 +42,13 @@ def ensure_directory_exists(path: str):
42
42
  os.makedirs(path, exist_ok=True)
43
43
 
44
44
 
45
+ def check_file_exists(path: str, msg: Optional[str] = None):
46
+ """Checks that `path` exists, raises FileNotFoundError if it doesn't."""
47
+ if not os.path.exists(path):
48
+ error_msg = msg if msg else f"Required file not found: {path}"
49
+ raise FileNotFoundError(error_msg)
50
+
51
+
45
52
  def parse_hocon(text: str):
46
53
  """Parse `text` (in HOCON format) into a dict-like object."""
47
54
  return pyhocon.ConfigFactory.parse_string(text)
@@ -156,7 +163,7 @@ def format_split(split: str) -> str:
156
163
 
157
164
 
158
165
  def asdict_without_nones(obj: Any) -> Dict[str, Any]:
159
- if not is_dataclass(obj):
166
+ if not is_dataclass(obj) or isinstance(obj, type):
160
167
  raise ValueError(f"Expected dataclass, got '{obj}'")
161
168
  return asdict(obj, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})
162
169
 
@@ -178,7 +185,7 @@ def binarize_dict(d: Dict[str, int]) -> Dict[str, int]:
178
185
 
179
186
  def serialize(obj: Any) -> List[str]:
180
187
  """Takes in a dataclass and outputs all of its fields and values in a list."""
181
- if not is_dataclass(obj):
188
+ if not is_dataclass(obj) or isinstance(obj, type):
182
189
  raise ValueError(f"Expected dataclass, got '{obj}'")
183
190
  return [f"{key}: {json.dumps(value)}" for key, value in asdict(obj).items()]
184
191
 
@@ -1,6 +1,11 @@
1
+ import logging
2
+ import logging.config
3
+ import yaml
4
+ import os
1
5
  import sys
2
6
  import time
3
7
  from typing import Any, Callable, List, Optional
8
+ from colorlog import ColoredFormatter
4
9
 
5
10
 
6
11
  class HierarchicalLogger(object):
@@ -20,24 +25,43 @@ class HierarchicalLogger(object):
20
25
  } [0s]
21
26
  """
22
27
 
28
+ # Far too much effort to unwind every call to hlog to go via logging,
29
+ # And is a terrible idea to inspect the stack every time hlog is called
30
+ # to figure out the caller,
31
+ # So just log everything under "helm".
32
+ logger = logging.getLogger("helm")
33
+
23
34
  def __init__(self) -> None:
24
35
  self.start_times: List[float] = []
25
36
 
26
37
  def indent(self) -> str:
27
38
  return " " * len(self.start_times)
28
39
 
29
- def track_begin(self, x: Any) -> None:
30
- print(self.indent() + str(x) + " {")
40
+ def track_begin(self, x: Any, **kwargs) -> None:
41
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
42
+ self.logger.info(self.indent() + str(x) + " {", **kwargs)
31
43
  sys.stdout.flush()
32
44
  self.start_times.append(time.time())
33
45
 
34
- def track_end(self) -> None:
46
+ def track_end(self, **kwargs) -> None:
47
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
35
48
  t = time.time() - self.start_times.pop()
36
- print(self.indent() + "} [%s]" % (format_time(t)))
49
+ self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
50
+ sys.stdout.flush()
51
+
52
+ def log(self, x: Any, **kwargs) -> None:
53
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
54
+ self.logger.info(self.indent() + str(x), **kwargs)
37
55
  sys.stdout.flush()
38
56
 
39
- def log(self, x: Any) -> None:
40
- print(self.indent() + str(x))
57
+ def debug(self, x: Any, **kwargs) -> None:
58
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
59
+ self.logger.debug(self.indent() + str(x), **kwargs)
60
+ sys.stdout.flush()
61
+
62
+ def warn(self, x: Any, **kwargs) -> None:
63
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
64
+ self.logger.warning(self.indent() + str(x), **kwargs)
41
65
  sys.stdout.flush()
42
66
 
43
67
 
@@ -57,19 +81,31 @@ singleton = HierarchicalLogger()
57
81
  # Exposed public methods
58
82
 
59
83
 
60
- def hlog(x: Any) -> None:
61
- singleton.log(x)
84
+ def hdebug(x: Any, **kwargs) -> None:
85
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
86
+ singleton.debug(x, **kwargs)
87
+
88
+
89
+ def hlog(x: Any, **kwargs) -> None:
90
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
91
+ singleton.log(x, **kwargs)
92
+
93
+
94
+ def hwarn(x: Any, **kwargs) -> None:
95
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
96
+ singleton.warn(x, **kwargs)
62
97
 
63
98
 
64
99
  class htrack_block:
65
- def __init__(self, x: Any) -> None:
100
+ def __init__(self, x: Any, stacklevel=1) -> None:
101
+ self._stacklevel = stacklevel + 1
66
102
  self.x = x
67
103
 
68
104
  def __enter__(self) -> None:
69
- singleton.track_begin(self.x)
105
+ singleton.track_begin(self.x, stacklevel=self._stacklevel)
70
106
 
71
107
  def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
72
- singleton.track_end()
108
+ singleton.track_end(stacklevel=self._stacklevel)
73
109
 
74
110
 
75
111
  class htrack:
@@ -100,7 +136,63 @@ class htrack:
100
136
  description = description.replace("$" + k, str(v))
101
137
  else:
102
138
  description = ""
103
- with htrack_block(parent + fn.__name__ + description):
139
+ with htrack_block(parent + fn.__name__ + description, stacklevel=2):
104
140
  return fn(*args, **kwargs)
105
141
 
106
142
  return wrapper
143
+
144
+
145
+ def setup_default_logging(config_path: Optional[str] = None):
146
+ """
147
+ Setup Python logging for HELM
148
+
149
+ Priority:
150
+ 1. External config file (YAML or JSON).
151
+ 2. ENV var HELM_LOG_LEVEL, falling back to LOG_LEVEL.
152
+ 3. a default logger to STDOUT
153
+ """
154
+ logger = logging.getLogger("helm")
155
+ logger.propagate = False
156
+
157
+ if config_path and os.path.exists(config_path):
158
+ with open(config_path, "r") as f:
159
+ config = yaml.safe_load(f)
160
+ logging.config.dictConfig(config)
161
+ hdebug("setup custom HELM logging")
162
+ return
163
+
164
+ log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
165
+ try:
166
+ logger.setLevel(getattr(logging, log_level))
167
+ except AttributeError:
168
+ logger.setLevel(logging.INFO)
169
+
170
+ # Set formatter
171
+ formatter: Optional[logging.Formatter] = None
172
+ if sys.stdout.isatty():
173
+ try:
174
+ formatter = ColoredFormatter(
175
+ "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
176
+ datefmt="%Y-%m-%dT%H:%M:%S",
177
+ reset=True,
178
+ log_colors={
179
+ "DEBUG": "cyan",
180
+ "INFO": "green",
181
+ "WARNING": "yellow",
182
+ "ERROR": "red",
183
+ "CRITICAL": "red,bg_white",
184
+ },
185
+ style="%",
186
+ )
187
+ except ImportError:
188
+ pass
189
+
190
+ if formatter is None:
191
+ # fallback
192
+ formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
193
+
194
+ # Add default stdout handler
195
+ handler = logging.StreamHandler(sys.stdout)
196
+ handler.setFormatter(formatter)
197
+ logger.addHandler(handler)
198
+ hdebug("setup default HELM logging")
@@ -0,0 +1,140 @@
1
+ import dataclasses
2
+ import os
3
+ from typing import Optional
4
+
5
+ from helm.common.context import Context
6
+ from helm.common.cache import CacheConfig
7
+ from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
8
+ from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
9
+ from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
10
+ from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
11
+ from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
12
+ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
13
+ from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
14
+ from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
15
+ from helm.common.tokenization_request import (
16
+ TokenizationRequest,
17
+ TokenizationRequestResult,
18
+ DecodeRequest,
19
+ DecodeRequestResult,
20
+ )
21
+ from helm.common.request import Request, RequestResult
22
+ from helm.clients.auto_client import AutoClient
23
+ from helm.clients.moderation_api_client import ModerationAPIClient
24
+ from helm.clients.image_generation.nudity_check_client import NudityCheckClient
25
+ from helm.clients.gcs_client import GCSClient
26
+ from helm.clients.clip_score_client import CLIPScoreClient
27
+ from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
28
+ from helm.proxy.example_queries import example_queries
29
+ from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
30
+ from helm.proxy.query import Query, QueryResult
31
+ from helm.proxy.retry import retry_request
32
+ from helm.tokenizers.auto_tokenizer import AutoTokenizer
33
+ from helm.proxy.services.service import (
34
+ CACHE_DIR,
35
+ GeneralInfo,
36
+ VERSION,
37
+ expand_environments,
38
+ synthesize_request,
39
+ )
40
+
41
+
42
+ class LocalContext(Context):
43
+ """
44
+ Main class that supports various functionality for the server.
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ base_path: str = "prod_env",
50
+ cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
51
+ ):
52
+ ensure_directory_exists(base_path)
53
+ client_file_storage_path = os.path.join(base_path, CACHE_DIR)
54
+ ensure_directory_exists(client_file_storage_path)
55
+
56
+ credentials = get_credentials(base_path)
57
+
58
+ self.cache_backend_config = cache_backend_config
59
+ self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
60
+ self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
61
+
62
+ # Lazily instantiate the following clients
63
+ self.moderation_api_client: Optional[ModerationAPIClient] = None
64
+ self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
65
+ self.perspective_api_client: Optional[ToxicityClassifierClient] = None
66
+ self.nudity_check_client: Optional[NudityCheckClient] = None
67
+ self.clip_score_client: Optional[CLIPScoreClient] = None
68
+ self.gcs_client: Optional[GCSClient] = None
69
+
70
+ def get_general_info(self) -> GeneralInfo:
71
+ # Can't send release_dates in ModelMetadata because dates cannot be round-tripped to and from JSON easily.
72
+ # TODO(#2158): Either fix this or delete get_general_info.
73
+ all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
74
+ return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
75
+
76
+ def expand_query(self, query: Query) -> QueryResult:
77
+ """Turn the `query` into requests."""
78
+ prompt = query.prompt
79
+ settings = query.settings
80
+ environments = parse_hocon(query.environments)
81
+ requests = []
82
+ for environment in expand_environments(environments):
83
+ request = synthesize_request(prompt, settings, environment)
84
+ requests.append(request)
85
+ return QueryResult(requests=requests)
86
+
87
+ def make_request(self, request: Request) -> RequestResult:
88
+ """Actually make a request to an API."""
89
+ return self.client.make_request(request)
90
+
91
+ def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
92
+ return self.tokenizer.tokenize(request)
93
+
94
+ def decode(self, request: DecodeRequest) -> DecodeRequestResult:
95
+ return self.tokenizer.decode(request)
96
+
97
+ def upload(self, request: FileUploadRequest) -> FileUploadResult:
98
+ if not self.gcs_client:
99
+ self.gcs_client = self.client.get_gcs_client()
100
+
101
+ assert self.gcs_client
102
+ return self.gcs_client.upload(request)
103
+
104
+ def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
105
+ if not self.nudity_check_client:
106
+ self.nudity_check_client = self.client.get_nudity_check_client()
107
+
108
+ assert self.nudity_check_client
109
+ return self.nudity_check_client.check_nudity(request)
110
+
111
+ def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
112
+ if not self.clip_score_client:
113
+ self.clip_score_client = self.client.get_clip_score_client()
114
+
115
+ assert self.clip_score_client
116
+ return self.clip_score_client.compute_score(request)
117
+
118
+ def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
119
+ @retry_request
120
+ def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
121
+ if not self.toxicity_classifier_client:
122
+ self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
123
+ return self.toxicity_classifier_client.get_toxicity_scores(request)
124
+
125
+ return get_toxicity_scores_with_retry(request)
126
+
127
+ def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
128
+ @retry_request
129
+ def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
130
+ if not self.moderation_api_client:
131
+ self.moderation_api_client = self.client.get_moderation_api_client()
132
+ return self.moderation_api_client.get_moderation_results(request)
133
+
134
+ return get_moderation_results_with_retry(request)
135
+
136
+ def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
137
+ return self.client.get_critique_client().make_critique_request(request)
138
+
139
+ def get_cache_config(self, shard_name: str) -> CacheConfig:
140
+ return self.cache_backend_config.get_cache_config(shard_name)
@@ -55,14 +55,23 @@ def inject_object_spec_args(
55
55
  This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
56
56
 
57
57
  Example:
58
-
59
- class MyClass:
60
- def __init__(a: int, b: int, c: int, d: int = 0):
61
- pass
62
-
63
- old_object_spec = ObjectSpec(class_name="MyClass", args={"a": 11})
64
- new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
65
- # new_object_spec is now ObjectSpec(class_name="MyClass", args={"a": 11, "b": 12, "c": 13})
58
+ >>> from helm.common.object_spec import * # NOQA
59
+ >>> import sys, types
60
+ >>> # Given a custom class with hashable arguments
61
+ >>> class MyClass:
62
+ ... def __init__(a: int, b: int, c: int, d: int = 0):
63
+ ... pass
64
+ >>> #
65
+ >>> # <boilerplate>: make a dummy module for MyClass to make this doctest executable
66
+ >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
67
+ >>> # </boilerplate>
68
+ >>> #
69
+ >>> # Define new style and old style object specs
70
+ >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
71
+ >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
72
+ >>> # new_object_spec is now
73
+ >>> print(new_object_spec)
74
+ ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
66
75
  """
67
76
  cls = get_class_by_name(spec.class_name)
68
77
  init_signature = inspect.signature(cls.__init__)
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
93
102
  <class_name>:<key>=<value>,<key>=<value>
94
103
  Usually, the description is something that's succinct and can be typed on the command-line.
95
104
  Here, value defaults to string.
105
+
106
+ Example:
107
+ >>> from helm.common.object_spec import * # NOQA
108
+ >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
109
+ >>> parse_object_spec(description)
110
+ ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
96
111
  """
97
112
 
98
113
  def parse_arg(arg: str) -> Tuple[str, Any]:
@@ -0,0 +1,61 @@
1
+ from helm.common.context import Context
2
+ from helm.common.cache import CacheConfig
3
+ from helm.common.authentication import Authentication
4
+ from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
5
+ from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
6
+ from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
7
+ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
8
+ from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
9
+ from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
10
+ from helm.common.tokenization_request import (
11
+ TokenizationRequest,
12
+ TokenizationRequestResult,
13
+ DecodeRequestResult,
14
+ DecodeRequest,
15
+ )
16
+ from helm.common.request import Request, RequestResult
17
+ from helm.proxy.query import Query, QueryResult
18
+ from helm.proxy.services.remote_service import RemoteService
19
+ from helm.proxy.services.service import GeneralInfo, Service
20
+
21
+
22
class RemoteContext(Context):
    """A `Context` that delegates every operation to a `RemoteService`.

    The service is built from `base_url`. Calls that take an authentication
    argument are forwarded with the `auth` object supplied at construction.
    """

    def __init__(self, base_url: str, auth: Authentication):
        self.service: Service = RemoteService(base_url)
        self.auth = auth

    def get_general_info(self) -> GeneralInfo:
        """Return the service's general info (no authentication is passed)."""
        return self.service.get_general_info()

    def expand_query(self, query: Query) -> QueryResult:
        """Ask the service to expand `query` (no authentication is passed)."""
        return self.service.expand_query(query)

    def make_request(self, request: Request) -> RequestResult:
        """Forward `request` to the service together with `self.auth`."""
        service = self.service
        return service.make_request(self.auth, request)

    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
        """Forward a tokenization request to the service together with `self.auth`."""
        service = self.service
        return service.tokenize(self.auth, request)

    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
        """Forward a decode request to the service together with `self.auth`."""
        service = self.service
        return service.decode(self.auth, request)

    def upload(self, request: FileUploadRequest) -> FileUploadResult:
        """Forward a file upload request to the service together with `self.auth`."""
        service = self.service
        return service.upload(self.auth, request)

    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
        """Forward a nudity check request to the service together with `self.auth`."""
        service = self.service
        return service.check_nudity(self.auth, request)

    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
        """Forward a CLIP score request to the service together with `self.auth`."""
        service = self.service
        return service.compute_clip_score(self.auth, request)

    def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
        """Forward a Perspective API request to the service together with `self.auth`."""
        service = self.service
        return service.get_toxicity_scores(self.auth, request)

    def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
        """Forward a moderation API request to the service together with `self.auth`."""
        service = self.service
        return service.get_moderation_results(self.auth, request)

    def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
        """Forward a critique request to the service together with `self.auth`."""
        service = self.service
        return service.make_critique_request(self.auth, request)

    def get_cache_config(self, shard_name: str) -> CacheConfig:
        """Return the service's cache config for `shard_name` (no authentication is passed)."""
        return self.service.get_cache_config(shard_name)
helm/common/request.py CHANGED
@@ -131,6 +131,11 @@ class Token:
131
131
  ]
132
132
 
133
133
 
134
@dataclass(frozen=True)
class Thinking:
    """Immutable holder for an optional piece of thinking text."""

    # The thinking text; None when no text is available.
    text: Optional[str] = None
137
+
138
+
134
139
  @dataclass(frozen=True)
135
140
  class GeneratedOutput:
136
141
  """A `GeneratedOutput` is a single generated output that may contain text or multimodal content."""
@@ -150,6 +155,9 @@ class GeneratedOutput:
150
155
  # Could be a sequence made up of multimedia content
151
156
  multimodal_content: Optional[MultimediaObject] = None
152
157
 
158
+ # Could be reasoning
159
+ thinking: Optional[Thinking] = None
160
+
153
161
  def __add__(self, other: "GeneratedOutput") -> "GeneratedOutput":
154
162
  return GeneratedOutput(self.text + other.text, self.logprob + other.logprob, self.tokens + other.tokens)
155
163
 
@@ -0,0 +1,94 @@
1
+ import sys
2
+ import tempfile
3
+ import textwrap
4
+ import pathlib
5
+ from helm.benchmark import run
6
+ from typing import List, Optional
7
+
8
+
9
class ArgvContext:
    """Context manager that temporarily replaces ``sys.argv``.

    On entry, the current ``sys.argv`` is saved and replaced with a copy of
    ``argv`` (an empty list when ``argv`` is ``None`` or empty). On exit, the
    saved value is restored, whether or not the body raised.

    Args:
        argv: The argument list to install while the context is active.
    """

    def __init__(self, argv: Optional[List[str]]):
        self.argv = argv
        self._original_argv: Optional[List[str]] = None

    def __enter__(self) -> "ArgvContext":
        self._original_argv = sys.argv[:]
        # Install a copy so code that mutates sys.argv inside the block
        # cannot alter the list the caller passed in.
        sys.argv = list(self.argv) if self.argv else []
        # Return self so `with ArgvContext(...) as ctx` binds the manager.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        assert self._original_argv is not None  # Satisfies mypy
        sys.argv = self._original_argv
        # Returning None lets any exception from the body propagate.
25
+
26
+
27
def test_run_with_custom_logging_config():
    """Run `run.main()` in dry-run mode with a custom YAML logging config.

    The config routes the `helm` logger to a file handler inside a temporary
    directory. The test then checks that the log file exists and contains at
    least one of the expected keywords.
    """
    # Setup temporary directory
    with tempfile.TemporaryDirectory(prefix="helm_test_") as tmp_dir_str:
        tmp_dir = pathlib.Path(tmp_dir_str)
        log_path = tmp_dir / "test.log"
        log_config_path = tmp_dir / "test_config.yaml"

        # Write custom YAML log config to file
        log_config_text = textwrap.dedent(
            f"""
            version: 1
            disable_existing_loggers: false
            formatters:
              simple:
                datefmt: '%Y-%m-%dT%H:%M:%S'
                format: '%(asctime)s %(levelname)s %(name)s %(message)s'
            handlers:
              file:
                class: logging.FileHandler
                filename: {log_path}
                formatter: simple
                level: DEBUG
                mode: w
            loggers:
              helm:
                handlers:
                - file
                level: DEBUG
                propagate: false
            """
        ).strip()

        log_config_path.write_text(log_config_text)

        # Simulate command-line arguments
        argv = [
            "run.py",  # Fake script name
            "--run-entries",
            "mmlu:subject=philosophy,model=openai/gpt2",
            "-m",
            "1",
            "--suite",
            "my-suite",
            "--dry-run",
            "--log-config",
            str(log_config_path),
        ]

        # Call main
        with ArgvContext(argv):
            run.main()

        # Check log file contents
        assert log_path.exists(), "Log file was not created"
        log_contents = log_path.read_text()

        # Test that log file was written to disk as requested
        print("Log Contents")
        print("------------")
        print(log_contents)

        # Include the log contents in the failure message; the original message
        # ended with ":\n" but never appended them.
        assert (
            "mscoco" in log_contents or "huggingface" in log_contents or "dry-run" in log_contents
        ), f"Expected log content not found in log file:\n{log_contents}"
91
+
92
+
93
if __name__ == "__main__":
    # Allow running this test directly as a script, without a test runner.
    test_run_with_custom_logging_config()