crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
2
2
 
3
3
  from helm.common.cache import CacheConfig
4
4
  from helm.common.request import Request
5
- from helm.clients.openai_client import OpenAILegacyCompletionsClient
5
+ from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
6
6
  from helm.tokenizers.tokenizer import Tokenizer
7
7
 
8
8
 
@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
19
19
  tokenizer_name: str,
20
20
  cache_config: CacheConfig,
21
21
  base_url: Optional[str] = None,
22
+ vllm_model_name: Optional[str] = None,
23
+ **kwargs,
22
24
  ):
23
25
  super().__init__(
24
26
  tokenizer=tokenizer,
@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
27
29
  api_key="EMPTY",
28
30
  org_id=None,
29
31
  base_url=base_url,
32
+ openai_model_name=vllm_model_name,
33
+ **kwargs,
30
34
  )
31
35
  self.tokenizer = tokenizer
32
36
  self.tokenizer_name = tokenizer_name
33
-
34
- def _get_model_for_request(self, request: Request) -> str:
35
- # The `model` parameter for vLLM should be the whole model name including the creator organization,
36
- # unlike OpenAI which only uses the model engine.
37
- return request.model
37
+ self.vllm_model_name = vllm_model_name
38
38
 
39
39
  def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
40
40
  raw_request = super()._to_raw_completion_request(request)
41
41
  # This avoids the error: best_of must be 1 when using greedy sampling
42
- if "best_of" in raw_request and raw_request["best_of"] > 1:
42
+ if (
43
+ "temperature" in raw_request
44
+ and raw_request["temperature"] == 0.0
45
+ and "best_of" in raw_request
46
+ and raw_request["best_of"] > 1
47
+ ):
43
48
  raw_request["best_of"] = 1
44
49
  return raw_request
50
+
51
+
52
+ class VLLMChatClient(OpenAIClient):
53
+ """Sends request to a vLLM server using the OpenAI-compatible API.
54
+
55
+ Only uses the Chat Completions API.
56
+
57
+ See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
58
+
59
+ def __init__(
60
+ self,
61
+ tokenizer: Tokenizer,
62
+ tokenizer_name: str,
63
+ cache_config: CacheConfig,
64
+ base_url: Optional[str] = None,
65
+ vllm_model_name: Optional[str] = None,
66
+ **kwargs,
67
+ ):
68
+ super().__init__(
69
+ tokenizer=tokenizer,
70
+ tokenizer_name=tokenizer_name,
71
+ cache_config=cache_config,
72
+ api_key="EMPTY",
73
+ org_id=None,
74
+ base_url=base_url,
75
+ openai_model_name=vllm_model_name,
76
+ **kwargs,
77
+ )
78
+ self.tokenizer = tokenizer
79
+ self.tokenizer_name = tokenizer_name
80
+ self.vllm_model_name = vllm_model_name
@@ -0,0 +1,56 @@
1
+ from dataclasses import replace
2
+ import re
3
+ from typing import Any, Dict, List, Tuple
4
+
5
+ from helm.clients.vllm_client import VLLMChatClient
6
+ from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
7
+
8
+
9
+ class VLLMGraniteThinkingClient(VLLMChatClient):
10
+ """Sends request to a Granite model on vLLM server with thinking enabled.
11
+
12
+ From vLLM documentation at
13
+ https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
14
+
15
+ IBM Granite 3.2 reasoning is disabled by default;
16
+ to enable it, you must also pass thinking=True in your chat_template_kwargs.
17
+ """
18
+
19
+ def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
20
+ raw_request = super()._make_chat_raw_request(request)
21
+ raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
22
+ return raw_request
23
+
24
+ def _parse_thinking(self, input: str) -> Tuple[str, str]:
25
+ """Return a tuple of thinking text and output text."""
26
+ match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
27
+ if match:
28
+ return (match.group(1), match.group(2))
29
+
30
+ match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
31
+ if match:
32
+ return (match.group(1), match.group(2))
33
+
34
+ match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
35
+ if match:
36
+ return (match.group(1), "")
37
+
38
+ match = re.match(r"<think>(.*)", input, re.DOTALL)
39
+ if match:
40
+ return (match.group(1), "")
41
+
42
+ return (input, "")
43
+
44
+ def _make_chat_request(self, request: Request) -> RequestResult:
45
+ request_result = super()._make_chat_request(request)
46
+ modified_completions: List[GeneratedOutput] = []
47
+ for completion in request_result.completions:
48
+ thinking, modified_text = self._parse_thinking(completion.text)
49
+ modified_completions.append(
50
+ replace(
51
+ completion,
52
+ text=modified_text,
53
+ thinking=Thinking(text=thinking),
54
+ )
55
+ )
56
+ return replace(request_result, completions=modified_completions)
@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
6
6
  class QuestionType:
7
7
  """String enum of question types."""
8
8
 
9
- # TODO: Make this a StrEnum after upgrading to Python 3.11
10
9
  MULTIPLE_CHOICE: str = "multiple_choice"
11
10
  CHECKBOX: str = "checkbox"
12
11
  FREE_RESPONSE: str = "free_response"
@@ -1,4 +1,7 @@
1
1
  import logging
2
+ import logging.config
3
+ import yaml
4
+ import os
2
5
  import sys
3
6
  import time
4
7
  from typing import Any, Callable, List, Optional
@@ -34,22 +37,31 @@ class HierarchicalLogger(object):
34
37
  def indent(self) -> str:
35
38
  return " " * len(self.start_times)
36
39
 
37
- def track_begin(self, x: Any) -> None:
38
- self.logger.info(self.indent() + str(x) + " {")
40
+ def track_begin(self, x: Any, **kwargs) -> None:
41
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
42
+ self.logger.info(self.indent() + str(x) + " {", **kwargs)
39
43
  sys.stdout.flush()
40
44
  self.start_times.append(time.time())
41
45
 
42
- def track_end(self) -> None:
46
+ def track_end(self, **kwargs) -> None:
47
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
43
48
  t = time.time() - self.start_times.pop()
44
- self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
49
+ self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
45
50
  sys.stdout.flush()
46
51
 
47
- def log(self, x: Any) -> None:
48
- self.logger.info(self.indent() + str(x))
52
+ def log(self, x: Any, **kwargs) -> None:
53
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
54
+ self.logger.info(self.indent() + str(x), **kwargs)
49
55
  sys.stdout.flush()
50
56
 
51
- def warn(self, x: Any) -> None:
52
- self.logger.warning(self.indent() + str(x))
57
+ def debug(self, x: Any, **kwargs) -> None:
58
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
59
+ self.logger.debug(self.indent() + str(x), **kwargs)
60
+ sys.stdout.flush()
61
+
62
+ def warn(self, x: Any, **kwargs) -> None:
63
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
64
+ self.logger.warning(self.indent() + str(x), **kwargs)
53
65
  sys.stdout.flush()
54
66
 
55
67
 
@@ -69,23 +81,31 @@ singleton = HierarchicalLogger()
69
81
  # Exposed public methods
70
82
 
71
83
 
72
- def hlog(x: Any) -> None:
73
- singleton.log(x)
84
+ def hdebug(x: Any, **kwargs) -> None:
85
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
86
+ singleton.debug(x, **kwargs)
87
+
88
+
89
+ def hlog(x: Any, **kwargs) -> None:
90
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
91
+ singleton.log(x, **kwargs)
74
92
 
75
93
 
76
- def hwarn(x: Any) -> None:
77
- singleton.warn(x)
94
+ def hwarn(x: Any, **kwargs) -> None:
95
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
96
+ singleton.warn(x, **kwargs)
78
97
 
79
98
 
80
99
  class htrack_block:
81
- def __init__(self, x: Any) -> None:
100
+ def __init__(self, x: Any, stacklevel=1) -> None:
101
+ self._stacklevel = stacklevel + 1
82
102
  self.x = x
83
103
 
84
104
  def __enter__(self) -> None:
85
- singleton.track_begin(self.x)
105
+ singleton.track_begin(self.x, stacklevel=self._stacklevel)
86
106
 
87
107
  def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
88
- singleton.track_end()
108
+ singleton.track_end(stacklevel=self._stacklevel)
89
109
 
90
110
 
91
111
  class htrack:
@@ -116,34 +136,63 @@ class htrack:
116
136
  description = description.replace("$" + k, str(v))
117
137
  else:
118
138
  description = ""
119
- with htrack_block(parent + fn.__name__ + description):
139
+ with htrack_block(parent + fn.__name__ + description, stacklevel=2):
120
140
  return fn(*args, **kwargs)
121
141
 
122
142
  return wrapper
123
143
 
124
144
 
125
- def setup_default_logging():
145
+ def setup_default_logging(config_path: Optional[str] = None):
126
146
  """
127
- Setup a default logger to STDOUT for HELM via Python logging
128
- """
129
- formatter = ColoredFormatter(
130
- "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
131
- datefmt="%Y-%m-%dT%H:%M:%S",
132
- reset=True,
133
- log_colors={
134
- "DEBUG": "cyan",
135
- "INFO": "green",
136
- "WARNING": "yellow",
137
- "ERROR": "red",
138
- "CRITICAL": "red,bg_white",
139
- },
140
- secondary_log_colors={},
141
- style="%",
142
- )
147
+ Setup Python logging for HELM
143
148
 
149
+ Priority:
150
+ 1. External config file (YAML or JSON).
151
+ 2. ENV var HELM_LOG_LEVEL, falling back to LOG_LEVEL.
152
+ 3. a default logger to STDOUT
153
+ """
144
154
  logger = logging.getLogger("helm")
145
- logger.setLevel(logging.INFO)
146
155
  logger.propagate = False
156
+
157
+ if config_path and os.path.exists(config_path):
158
+ with open(config_path, "r") as f:
159
+ config = yaml.safe_load(f)
160
+ logging.config.dictConfig(config)
161
+ hdebug("setup custom HELM logging")
162
+ return
163
+
164
+ log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
165
+ try:
166
+ logger.setLevel(getattr(logging, log_level))
167
+ except AttributeError:
168
+ logger.setLevel(logging.INFO)
169
+
170
+ # Set formatter
171
+ formatter: Optional[logging.Formatter] = None
172
+ if sys.stdout.isatty():
173
+ try:
174
+ formatter = ColoredFormatter(
175
+ "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
176
+ datefmt="%Y-%m-%dT%H:%M:%S",
177
+ reset=True,
178
+ log_colors={
179
+ "DEBUG": "cyan",
180
+ "INFO": "green",
181
+ "WARNING": "yellow",
182
+ "ERROR": "red",
183
+ "CRITICAL": "red,bg_white",
184
+ },
185
+ style="%",
186
+ )
187
+ except ImportError:
188
+ pass
189
+
190
+ if formatter is None:
191
+ # fallback
192
+ formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
193
+
194
+ # Add default stdout handler
147
195
  handler = logging.StreamHandler(sys.stdout)
148
196
  handler.setFormatter(formatter)
149
197
  logger.addHandler(handler)
198
+ hdebug("setup default HELM logging")
@@ -55,14 +55,23 @@ def inject_object_spec_args(
55
55
  This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
56
56
 
57
57
  Example:
58
-
59
- class MyClass:
60
- def __init__(a: int, b: int, c: int, d: int = 0):
61
- pass
62
-
63
- old_object_spec = ObjectSpec(class_name="MyClass", args={"a": 11})
64
- new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
65
- # new_object_spec is now ObjectSpec(class_name="MyClass", args={"a": 11, "b": 12, "c": 13})
58
+ >>> from helm.common.object_spec import * # NOQA
59
+ >>> import sys, types
60
+ >>> # Given a custom class with hashable arguments
61
+ >>> class MyClass:
62
+ ... def __init__(a: int, b: int, c: int, d: int = 0):
63
+ ... pass
64
+ >>> #
65
+ >>> # <boilerplate>: make a dummy module for MyClass to make this doctest executable
66
+ >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
67
+ >>> # </boilerplate>
68
+ >>> #
69
+ >>> # Define new style and old style object specs
70
+ >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
71
+ >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
72
+ >>> # new_object_spec is now
73
+ >>> print(new_object_spec)
74
+ ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
66
75
  """
67
76
  cls = get_class_by_name(spec.class_name)
68
77
  init_signature = inspect.signature(cls.__init__)
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
93
102
  <class_name>:<key>=<value>,<key>=<value>
94
103
  Usually, the description is something that's succinct and can be typed on the command-line.
95
104
  Here, value defaults to string.
105
+
106
+ Example:
107
+ >>> from helm.common.object_spec import * # NOQA
108
+ >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
109
+ >>> parse_object_spec(description)
110
+ ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
96
111
  """
97
112
 
98
113
  def parse_arg(arg: str) -> Tuple[str, Any]:
@@ -0,0 +1,94 @@
1
+ import sys
2
+ import tempfile
3
+ import textwrap
4
+ import pathlib
5
+ from helm.benchmark import run
6
+ from typing import List, Optional
7
+
8
+
9
+ class ArgvContext:
10
+ """
11
+ Helper to assign a temporary value to sys.argv and then restore it
12
+ """
13
+
14
+ def __init__(self, argv: Optional[List[str]]):
15
+ self.argv = argv
16
+ self._original_argv: Optional[List[str]] = None
17
+
18
+ def __enter__(self):
19
+ self._original_argv = sys.argv[:]
20
+ sys.argv = self.argv or []
21
+
22
+ def __exit__(self, exc_type, exc_val, exc_tb):
23
+ assert self._original_argv is not None # Satisfies mypy
24
+ sys.argv = self._original_argv
25
+
26
+
27
+ def test_run_with_custom_logging_config():
28
+ # Setup temporary directory
29
+ with tempfile.TemporaryDirectory(prefix="helm_test_") as tmp_dir_str:
30
+ tmp_dir = pathlib.Path(tmp_dir_str)
31
+ log_path = tmp_dir / "test.log"
32
+ log_config_path = tmp_dir / "test_config.yaml"
33
+
34
+ # Write custom YAML log config to file
35
+ log_config_text = textwrap.dedent(
36
+ f"""
37
+ version: 1
38
+ disable_existing_loggers: false
39
+ formatters:
40
+ simple:
41
+ datefmt: '%Y-%m-%dT%H:%M:%S'
42
+ format: '%(asctime)s %(levelname)s %(name)s %(message)s'
43
+ handlers:
44
+ file:
45
+ class: logging.FileHandler
46
+ filename: {log_path}
47
+ formatter: simple
48
+ level: DEBUG
49
+ mode: w
50
+ loggers:
51
+ helm:
52
+ handlers:
53
+ - file
54
+ level: DEBUG
55
+ propagate: false
56
+ """
57
+ ).strip()
58
+
59
+ log_config_path.write_text(log_config_text)
60
+
61
+ # Simulate command-line arguments
62
+ argv = [
63
+ "run.py", # Fake script name
64
+ "--run-entries",
65
+ "mmlu:subject=philosophy,model=openai/gpt2",
66
+ "-m",
67
+ "1",
68
+ "--suite",
69
+ "my-suite",
70
+ "--dry-run",
71
+ "--log-config",
72
+ str(log_config_path),
73
+ ]
74
+
75
+ # Call main
76
+ with ArgvContext(argv):
77
+ run.main()
78
+
79
+ # Check log file contents
80
+ assert log_path.exists(), "Log file was not created"
81
+ log_contents = log_path.read_text()
82
+
83
+ # Test that log file was written to disk as requested
84
+ print("Log Contents")
85
+ print("------------")
86
+ print(log_contents)
87
+
88
+ assert (
89
+ "mscoco" in log_contents or "huggingface" in log_contents or "dry-run" in log_contents
90
+ ), "Expected log content not found in log file:\n"
91
+
92
+
93
+ if __name__ == "__main__":
94
+ test_run_with_custom_logging_config()