crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/clients/vllm_client.py
CHANGED
|
@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
|
|
|
2
2
|
|
|
3
3
|
from helm.common.cache import CacheConfig
|
|
4
4
|
from helm.common.request import Request
|
|
5
|
-
from helm.clients.openai_client import OpenAILegacyCompletionsClient
|
|
5
|
+
from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
|
|
6
6
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
7
7
|
|
|
8
8
|
|
|
@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
|
|
|
19
19
|
tokenizer_name: str,
|
|
20
20
|
cache_config: CacheConfig,
|
|
21
21
|
base_url: Optional[str] = None,
|
|
22
|
+
vllm_model_name: Optional[str] = None,
|
|
23
|
+
**kwargs,
|
|
22
24
|
):
|
|
23
25
|
super().__init__(
|
|
24
26
|
tokenizer=tokenizer,
|
|
@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
|
|
|
27
29
|
api_key="EMPTY",
|
|
28
30
|
org_id=None,
|
|
29
31
|
base_url=base_url,
|
|
32
|
+
openai_model_name=vllm_model_name,
|
|
33
|
+
**kwargs,
|
|
30
34
|
)
|
|
31
35
|
self.tokenizer = tokenizer
|
|
32
36
|
self.tokenizer_name = tokenizer_name
|
|
33
|
-
|
|
34
|
-
def _get_model_for_request(self, request: Request) -> str:
|
|
35
|
-
# The `model` parameter for vLLM should be the whole model name including the creator organization,
|
|
36
|
-
# unlike OpenAI which only uses the model engine.
|
|
37
|
-
return request.model
|
|
37
|
+
self.vllm_model_name = vllm_model_name
|
|
38
38
|
|
|
39
39
|
def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
|
|
40
40
|
raw_request = super()._to_raw_completion_request(request)
|
|
41
41
|
# This avoids the error: best_of must be 1 when using greedy sampling
|
|
42
|
-
if
|
|
42
|
+
if (
|
|
43
|
+
"temperature" in raw_request
|
|
44
|
+
and raw_request["temperature"] == 0.0
|
|
45
|
+
and "best_of" in raw_request
|
|
46
|
+
and raw_request["best_of"] > 1
|
|
47
|
+
):
|
|
43
48
|
raw_request["best_of"] = 1
|
|
44
49
|
return raw_request
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class VLLMChatClient(OpenAIClient):
|
|
53
|
+
"""Sends request to a vLLM server using the OpenAI-compatible API.
|
|
54
|
+
|
|
55
|
+
Only uses the Chat Completions API.
|
|
56
|
+
|
|
57
|
+
See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
|
|
58
|
+
|
|
59
|
+
def __init__(
|
|
60
|
+
self,
|
|
61
|
+
tokenizer: Tokenizer,
|
|
62
|
+
tokenizer_name: str,
|
|
63
|
+
cache_config: CacheConfig,
|
|
64
|
+
base_url: Optional[str] = None,
|
|
65
|
+
vllm_model_name: Optional[str] = None,
|
|
66
|
+
**kwargs,
|
|
67
|
+
):
|
|
68
|
+
super().__init__(
|
|
69
|
+
tokenizer=tokenizer,
|
|
70
|
+
tokenizer_name=tokenizer_name,
|
|
71
|
+
cache_config=cache_config,
|
|
72
|
+
api_key="EMPTY",
|
|
73
|
+
org_id=None,
|
|
74
|
+
base_url=base_url,
|
|
75
|
+
openai_model_name=vllm_model_name,
|
|
76
|
+
**kwargs,
|
|
77
|
+
)
|
|
78
|
+
self.tokenizer = tokenizer
|
|
79
|
+
self.tokenizer_name = tokenizer_name
|
|
80
|
+
self.vllm_model_name = vllm_model_name
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from dataclasses import replace
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any, Dict, List, Tuple
|
|
4
|
+
|
|
5
|
+
from helm.clients.vllm_client import VLLMChatClient
|
|
6
|
+
from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VLLMGraniteThinkingClient(VLLMChatClient):
|
|
10
|
+
"""Sends request to a Granite model on vLLM server with thinking enabled.
|
|
11
|
+
|
|
12
|
+
From vLLM documentation at
|
|
13
|
+
https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
|
|
14
|
+
|
|
15
|
+
IBM Granite 3.2 reasoning is disabled by default;
|
|
16
|
+
to enable it, you must also pass thinking=True in your chat_template_kwargs.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
|
|
20
|
+
raw_request = super()._make_chat_raw_request(request)
|
|
21
|
+
raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
|
|
22
|
+
return raw_request
|
|
23
|
+
|
|
24
|
+
def _parse_thinking(self, input: str) -> Tuple[str, str]:
|
|
25
|
+
"""Return a tuple of thinking text and output text."""
|
|
26
|
+
match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
|
|
27
|
+
if match:
|
|
28
|
+
return (match.group(1), match.group(2))
|
|
29
|
+
|
|
30
|
+
match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
|
|
31
|
+
if match:
|
|
32
|
+
return (match.group(1), match.group(2))
|
|
33
|
+
|
|
34
|
+
match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
|
|
35
|
+
if match:
|
|
36
|
+
return (match.group(1), "")
|
|
37
|
+
|
|
38
|
+
match = re.match(r"<think>(.*)", input, re.DOTALL)
|
|
39
|
+
if match:
|
|
40
|
+
return (match.group(1), "")
|
|
41
|
+
|
|
42
|
+
return (input, "")
|
|
43
|
+
|
|
44
|
+
def _make_chat_request(self, request: Request) -> RequestResult:
|
|
45
|
+
request_result = super()._make_chat_request(request)
|
|
46
|
+
modified_completions: List[GeneratedOutput] = []
|
|
47
|
+
for completion in request_result.completions:
|
|
48
|
+
thinking, modified_text = self._parse_thinking(completion.text)
|
|
49
|
+
modified_completions.append(
|
|
50
|
+
replace(
|
|
51
|
+
completion,
|
|
52
|
+
text=modified_text,
|
|
53
|
+
thinking=Thinking(text=thinking),
|
|
54
|
+
)
|
|
55
|
+
)
|
|
56
|
+
return replace(request_result, completions=modified_completions)
|
helm/common/critique_request.py
CHANGED
|
@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
|
|
|
6
6
|
class QuestionType:
|
|
7
7
|
"""String enum of question types."""
|
|
8
8
|
|
|
9
|
-
# TODO: Make this a StrEnum after upgrading to Python 3.11
|
|
10
9
|
MULTIPLE_CHOICE: str = "multiple_choice"
|
|
11
10
|
CHECKBOX: str = "checkbox"
|
|
12
11
|
FREE_RESPONSE: str = "free_response"
|
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import logging.config
|
|
3
|
+
import yaml
|
|
4
|
+
import os
|
|
2
5
|
import sys
|
|
3
6
|
import time
|
|
4
7
|
from typing import Any, Callable, List, Optional
|
|
@@ -34,22 +37,31 @@ class HierarchicalLogger(object):
|
|
|
34
37
|
def indent(self) -> str:
|
|
35
38
|
return " " * len(self.start_times)
|
|
36
39
|
|
|
37
|
-
def track_begin(self, x: Any) -> None:
|
|
38
|
-
|
|
40
|
+
def track_begin(self, x: Any, **kwargs) -> None:
|
|
41
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
42
|
+
self.logger.info(self.indent() + str(x) + " {", **kwargs)
|
|
39
43
|
sys.stdout.flush()
|
|
40
44
|
self.start_times.append(time.time())
|
|
41
45
|
|
|
42
|
-
def track_end(self) -> None:
|
|
46
|
+
def track_end(self, **kwargs) -> None:
|
|
47
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
43
48
|
t = time.time() - self.start_times.pop()
|
|
44
|
-
self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
|
|
49
|
+
self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
|
|
45
50
|
sys.stdout.flush()
|
|
46
51
|
|
|
47
|
-
def log(self, x: Any) -> None:
|
|
48
|
-
|
|
52
|
+
def log(self, x: Any, **kwargs) -> None:
|
|
53
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
54
|
+
self.logger.info(self.indent() + str(x), **kwargs)
|
|
49
55
|
sys.stdout.flush()
|
|
50
56
|
|
|
51
|
-
def
|
|
52
|
-
|
|
57
|
+
def debug(self, x: Any, **kwargs) -> None:
|
|
58
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
59
|
+
self.logger.debug(self.indent() + str(x), **kwargs)
|
|
60
|
+
sys.stdout.flush()
|
|
61
|
+
|
|
62
|
+
def warn(self, x: Any, **kwargs) -> None:
|
|
63
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
64
|
+
self.logger.warning(self.indent() + str(x), **kwargs)
|
|
53
65
|
sys.stdout.flush()
|
|
54
66
|
|
|
55
67
|
|
|
@@ -69,23 +81,31 @@ singleton = HierarchicalLogger()
|
|
|
69
81
|
# Exposed public methods
|
|
70
82
|
|
|
71
83
|
|
|
72
|
-
def
|
|
73
|
-
|
|
84
|
+
def hdebug(x: Any, **kwargs) -> None:
|
|
85
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
86
|
+
singleton.debug(x, **kwargs)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def hlog(x: Any, **kwargs) -> None:
|
|
90
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
91
|
+
singleton.log(x, **kwargs)
|
|
74
92
|
|
|
75
93
|
|
|
76
|
-
def hwarn(x: Any) -> None:
|
|
77
|
-
|
|
94
|
+
def hwarn(x: Any, **kwargs) -> None:
|
|
95
|
+
kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
|
|
96
|
+
singleton.warn(x, **kwargs)
|
|
78
97
|
|
|
79
98
|
|
|
80
99
|
class htrack_block:
|
|
81
|
-
def __init__(self, x: Any) -> None:
|
|
100
|
+
def __init__(self, x: Any, stacklevel=1) -> None:
|
|
101
|
+
self._stacklevel = stacklevel + 1
|
|
82
102
|
self.x = x
|
|
83
103
|
|
|
84
104
|
def __enter__(self) -> None:
|
|
85
|
-
singleton.track_begin(self.x)
|
|
105
|
+
singleton.track_begin(self.x, stacklevel=self._stacklevel)
|
|
86
106
|
|
|
87
107
|
def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
|
|
88
|
-
singleton.track_end()
|
|
108
|
+
singleton.track_end(stacklevel=self._stacklevel)
|
|
89
109
|
|
|
90
110
|
|
|
91
111
|
class htrack:
|
|
@@ -116,34 +136,63 @@ class htrack:
|
|
|
116
136
|
description = description.replace("$" + k, str(v))
|
|
117
137
|
else:
|
|
118
138
|
description = ""
|
|
119
|
-
with htrack_block(parent + fn.__name__ + description):
|
|
139
|
+
with htrack_block(parent + fn.__name__ + description, stacklevel=2):
|
|
120
140
|
return fn(*args, **kwargs)
|
|
121
141
|
|
|
122
142
|
return wrapper
|
|
123
143
|
|
|
124
144
|
|
|
125
|
-
def setup_default_logging():
|
|
145
|
+
def setup_default_logging(config_path: Optional[str] = None):
|
|
126
146
|
"""
|
|
127
|
-
Setup
|
|
128
|
-
"""
|
|
129
|
-
formatter = ColoredFormatter(
|
|
130
|
-
"%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
|
|
131
|
-
datefmt="%Y-%m-%dT%H:%M:%S",
|
|
132
|
-
reset=True,
|
|
133
|
-
log_colors={
|
|
134
|
-
"DEBUG": "cyan",
|
|
135
|
-
"INFO": "green",
|
|
136
|
-
"WARNING": "yellow",
|
|
137
|
-
"ERROR": "red",
|
|
138
|
-
"CRITICAL": "red,bg_white",
|
|
139
|
-
},
|
|
140
|
-
secondary_log_colors={},
|
|
141
|
-
style="%",
|
|
142
|
-
)
|
|
147
|
+
Setup Python logging for HELM
|
|
143
148
|
|
|
149
|
+
Priority:
|
|
150
|
+
1. External config file (YAML or JSON).
|
|
151
|
+
2. ENV var LOG_LEVEL.
|
|
152
|
+
3. a default logger to STDOUT
|
|
153
|
+
"""
|
|
144
154
|
logger = logging.getLogger("helm")
|
|
145
|
-
logger.setLevel(logging.INFO)
|
|
146
155
|
logger.propagate = False
|
|
156
|
+
|
|
157
|
+
if config_path and os.path.exists(config_path):
|
|
158
|
+
with open(config_path, "r") as f:
|
|
159
|
+
config = yaml.safe_load(f)
|
|
160
|
+
logging.config.dictConfig(config)
|
|
161
|
+
hdebug("setup custom HELM logging")
|
|
162
|
+
return
|
|
163
|
+
|
|
164
|
+
log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
|
|
165
|
+
try:
|
|
166
|
+
logger.setLevel(getattr(logging, log_level))
|
|
167
|
+
except AttributeError:
|
|
168
|
+
logger.setLevel(logging.INFO)
|
|
169
|
+
|
|
170
|
+
# Set formatter
|
|
171
|
+
formatter: Optional[logging.Formatter] = None
|
|
172
|
+
if sys.stdout.isatty():
|
|
173
|
+
try:
|
|
174
|
+
formatter = ColoredFormatter(
|
|
175
|
+
"%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
|
|
176
|
+
datefmt="%Y-%m-%dT%H:%M:%S",
|
|
177
|
+
reset=True,
|
|
178
|
+
log_colors={
|
|
179
|
+
"DEBUG": "cyan",
|
|
180
|
+
"INFO": "green",
|
|
181
|
+
"WARNING": "yellow",
|
|
182
|
+
"ERROR": "red",
|
|
183
|
+
"CRITICAL": "red,bg_white",
|
|
184
|
+
},
|
|
185
|
+
style="%",
|
|
186
|
+
)
|
|
187
|
+
except ImportError:
|
|
188
|
+
pass
|
|
189
|
+
|
|
190
|
+
if formatter is None:
|
|
191
|
+
# fallback
|
|
192
|
+
formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
|
|
193
|
+
|
|
194
|
+
# Add default stdout handler
|
|
147
195
|
handler = logging.StreamHandler(sys.stdout)
|
|
148
196
|
handler.setFormatter(formatter)
|
|
149
197
|
logger.addHandler(handler)
|
|
198
|
+
hdebug("setup default HELM logging")
|
helm/common/object_spec.py
CHANGED
|
@@ -55,14 +55,23 @@ def inject_object_spec_args(
|
|
|
55
55
|
This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
|
|
56
56
|
|
|
57
57
|
Example:
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
58
|
+
>>> from helm.common.object_spec import * # NOQA
|
|
59
|
+
>>> import sys, types
|
|
60
|
+
>>> # Given a custom class with hashable arguments
|
|
61
|
+
>>> class MyClass:
|
|
62
|
+
... def __init__(a: int, b: int, c: int, d: int = 0):
|
|
63
|
+
... pass
|
|
64
|
+
>>> #
|
|
65
|
+
>>> # <boilerplate>: make a dummy module for MyClass to make this doctest exectuable
|
|
66
|
+
>>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
|
|
67
|
+
>>> # </boilerplate>
|
|
68
|
+
>>> #
|
|
69
|
+
>>> # Define new style and old style object specs
|
|
70
|
+
>>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
|
|
71
|
+
>>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
|
|
72
|
+
>>> # new_object_spec is now
|
|
73
|
+
>>> print(new_object_spec)
|
|
74
|
+
ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
|
|
66
75
|
"""
|
|
67
76
|
cls = get_class_by_name(spec.class_name)
|
|
68
77
|
init_signature = inspect.signature(cls.__init__)
|
|
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
|
|
|
93
102
|
<class_name>:<key>=<value>,<key>=<value>
|
|
94
103
|
Usually, the description is something that's succinct and can be typed on the command-line.
|
|
95
104
|
Here, value defaults to string.
|
|
105
|
+
|
|
106
|
+
Example:
|
|
107
|
+
>>> from helm.common.object_spec import * # NOQA
|
|
108
|
+
>>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
|
|
109
|
+
>>> parse_object_spec(description)
|
|
110
|
+
ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
|
|
96
111
|
"""
|
|
97
112
|
|
|
98
113
|
def parse_arg(arg: str) -> Tuple[str, Any]:
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import tempfile
|
|
3
|
+
import textwrap
|
|
4
|
+
import pathlib
|
|
5
|
+
from helm.benchmark import run
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ArgvContext:
|
|
10
|
+
"""
|
|
11
|
+
Helper to assign a temporary value to sys.argv and then restore it
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
def __init__(self, argv: Optional[List[str]]):
|
|
15
|
+
self.argv = argv
|
|
16
|
+
self._original_argv: Optional[List[str]] = None
|
|
17
|
+
|
|
18
|
+
def __enter__(self):
|
|
19
|
+
self._original_argv = sys.argv[:]
|
|
20
|
+
sys.argv = self.argv or []
|
|
21
|
+
|
|
22
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
23
|
+
assert self._original_argv is not None # Satisfies mypy
|
|
24
|
+
sys.argv = self._original_argv
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_run_with_custom_logging_config():
|
|
28
|
+
# Setup temporary directory
|
|
29
|
+
with tempfile.TemporaryDirectory(prefix="helm_test_") as tmp_dir_str:
|
|
30
|
+
tmp_dir = pathlib.Path(tmp_dir_str)
|
|
31
|
+
log_path = tmp_dir / "test.log"
|
|
32
|
+
log_config_path = tmp_dir / "test_config.yaml"
|
|
33
|
+
|
|
34
|
+
# Write custom YAML log config to file
|
|
35
|
+
log_config_text = textwrap.dedent(
|
|
36
|
+
f"""
|
|
37
|
+
version: 1
|
|
38
|
+
disable_existing_loggers: false
|
|
39
|
+
formatters:
|
|
40
|
+
simple:
|
|
41
|
+
datefmt: '%Y-%m-%dT%H:%M:%S'
|
|
42
|
+
format: '%(asctime)s %(levelname)s %(name)s %(message)s'
|
|
43
|
+
handlers:
|
|
44
|
+
file:
|
|
45
|
+
class: logging.FileHandler
|
|
46
|
+
filename: {log_path}
|
|
47
|
+
formatter: simple
|
|
48
|
+
level: DEBUG
|
|
49
|
+
mode: w
|
|
50
|
+
loggers:
|
|
51
|
+
helm:
|
|
52
|
+
handlers:
|
|
53
|
+
- file
|
|
54
|
+
level: DEBUG
|
|
55
|
+
propagate: false
|
|
56
|
+
"""
|
|
57
|
+
).strip()
|
|
58
|
+
|
|
59
|
+
log_config_path.write_text(log_config_text)
|
|
60
|
+
|
|
61
|
+
# Simulate command-line arguments
|
|
62
|
+
argv = [
|
|
63
|
+
"run.py", # Fake script name
|
|
64
|
+
"--run-entries",
|
|
65
|
+
"mmlu:subject=philosophy,model=openai/gpt2",
|
|
66
|
+
"-m",
|
|
67
|
+
"1",
|
|
68
|
+
"--suite",
|
|
69
|
+
"my-suite",
|
|
70
|
+
"--dry-run",
|
|
71
|
+
"--log-config",
|
|
72
|
+
str(log_config_path),
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
# Call main
|
|
76
|
+
with ArgvContext(argv):
|
|
77
|
+
run.main()
|
|
78
|
+
|
|
79
|
+
# Check log file contents
|
|
80
|
+
assert log_path.exists(), "Log file was not created"
|
|
81
|
+
log_contents = log_path.read_text()
|
|
82
|
+
|
|
83
|
+
# Test that log file was written to disk as requested
|
|
84
|
+
print("Log Contents")
|
|
85
|
+
print("------------")
|
|
86
|
+
print(log_contents)
|
|
87
|
+
|
|
88
|
+
assert (
|
|
89
|
+
"mscoco" in log_contents or "huggingface" in log_contents or "dry-run" in log_contents
|
|
90
|
+
), "Expected log content not found in log file:\n"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == "__main__":
|
|
94
|
+
test_run_with_custom_logging_config()
|