PyPI - crfm-helm - Versions diffs - 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (103)

{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
helm/benchmark/annotation/air_bench_annotator.py +1 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +1 -1
helm/benchmark/metrics/copyright_metrics.py +1 -1
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +1 -1
helm/benchmark/model_deployment_registry.py +11 -19
helm/benchmark/presentation/create_plots.py +11 -2
helm/benchmark/presentation/schema.py +5 -0
helm/benchmark/presentation/summarize.py +9 -3
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/run.py +7 -1
helm/benchmark/run_specs/arabic_run_specs.py +73 -0
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/classic_run_specs.py +0 -53
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +3 -1
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +48 -1
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
helm/benchmark/scenarios/aratrust_scenario.py +76 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
helm/benchmark/scenarios/bluex_scenario.py +66 -0
helm/benchmark/scenarios/cleva_scenario.py +1 -1
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/math_scenario.py +21 -20
helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
helm/benchmark/scenarios/melt_scenarios.py +2 -2
helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/seahelm_scenario.py +2 -2
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +8 -1
helm/benchmark/static/schema_arabic.yaml +228 -0
helm/benchmark/static/schema_classic.yaml +0 -17
helm/benchmark/static/schema_long_context.yaml +19 -1
helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
helm/benchmark/static_build/index.html +1 -1
helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
helm/clients/huggingface_client.py +2 -2
helm/clients/openai_client.py +2 -1
helm/clients/openai_responses_client.py +6 -4
helm/clients/test_huggingface_client.py +3 -3
helm/clients/together_client.py +0 -2
helm/clients/vertexai_client.py +11 -9
helm/clients/vllm_client.py +43 -7
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/common/critique_request.py +0 -1
helm/common/hierarchical_logger.py +83 -34
helm/common/object_spec.py +23 -8
helm/common/test_logging.py +94 -0
helm/config/model_deployments.yaml +454 -175
helm/config/model_metadata.yaml +117 -10
helm/config/tokenizer_configs.yaml +81 -1
helm/proxy/cli.py +1 -1
helm/proxy/retry.py +5 -0
helm/tokenizers/grok_tokenizer.py +2 -0
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/scenarios/numeracy_scenario.py +0 -794
helm/benchmark/static_build/assets/index-94295e78.js +0 -10
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0

helm/clients/vllm_client.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
 from helm.common.cache import CacheConfig
 from helm.common.request import Request
-from helm.clients.openai_client import OpenAILegacyCompletionsClient
+from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
 from helm.tokenizers.tokenizer import Tokenizer
@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
         tokenizer_name: str,
         cache_config: CacheConfig,
         base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(
             tokenizer=tokenizer,
@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
             api_key="EMPTY",
             org_id=None,
             base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
         )
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-    def _get_model_for_request(self, request: Request) -> str:
-        # The `model` parameter for vLLM should be the whole model name including the creator organization,
-        # unlike OpenAI which only uses the model engine.
-        return request.model
+        self.vllm_model_name = vllm_model_name
     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
         raw_request = super()._to_raw_completion_request(request)
         # This avoids the error: best_of must be 1 when using greedy sampling
-        if "best_of" in raw_request and raw_request["best_of"] > 1:
+        if (
+            "temperature" in raw_request
+            and raw_request["temperature"] == 0.0
+            and "best_of" in raw_request
+            and raw_request["best_of"] > 1
+        ):
             raw_request["best_of"] = 1
         return raw_request
+class VLLMChatClient(OpenAIClient):
+    """Sends request to a vLLM server using the OpenAI-compatible API.
+    Only uses the Chat Completions API.
+    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="EMPTY",
+            org_id=None,
+            base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.vllm_model_name = vllm_model_name

helm/clients/vllm_granite_thinking_client.py ADDED Viewed

@@ -0,0 +1,56 @@
+from dataclasses import replace
+import re
+from typing import Any, Dict, List, Tuple
+from helm.clients.vllm_client import VLLMChatClient
+from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
+class VLLMGraniteThinkingClient(VLLMChatClient):
+    """Sends request to a Granite model on vLLM server with thinking enabled.
+    From vLLM documentation at
+    https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
+    IBM Granite 3.2 reasoning is disabled by default;
+    to enable it, you must also pass thinking=True in your chat_template_kwargs.
+    """
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
+        return raw_request
+    def _parse_thinking(self, input: str) -> Tuple[str, str]:
+        """Return a tuple of thinking text and output text."""
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+        match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+        match = re.match(r"<think>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+        return (input, "")
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        request_result = super()._make_chat_request(request)
+        modified_completions: List[GeneratedOutput] = []
+        for completion in request_result.completions:
+            thinking, modified_text = self._parse_thinking(completion.text)
+            modified_completions.append(
+                replace(
+                    completion,
+                    text=modified_text,
+                    thinking=Thinking(text=thinking),
+                )
+            )
+        return replace(request_result, completions=modified_completions)

helm/common/critique_request.py CHANGED Viewed

@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
 class QuestionType:
     """String enum of question types."""
-    # TODO: Make this a StrEnum after upgrading to Python 3.11
     MULTIPLE_CHOICE: str = "multiple_choice"
     CHECKBOX: str = "checkbox"
     FREE_RESPONSE: str = "free_response"

helm/common/hierarchical_logger.py CHANGED Viewed

@@ -1,4 +1,7 @@
 import logging
+import logging.config
+import yaml
+import os
 import sys
 import time
 from typing import Any, Callable, List, Optional
@@ -34,22 +37,31 @@ class HierarchicalLogger(object):
     def indent(self) -> str:
         return "  " * len(self.start_times)
-    def track_begin(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x) + " {")
+    def track_begin(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x) + " {", **kwargs)
         sys.stdout.flush()
         self.start_times.append(time.time())
-    def track_end(self) -> None:
+    def track_end(self, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
         t = time.time() - self.start_times.pop()
-        self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
+        self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
         sys.stdout.flush()
-    def log(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x))
+    def log(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
-    def warn(self, x: Any) -> None:
-        self.logger.warning(self.indent() + str(x))
+    def debug(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.debug(self.indent() + str(x), **kwargs)
+        sys.stdout.flush()
+    def warn(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.warning(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
@@ -69,23 +81,31 @@ singleton = HierarchicalLogger()
 # Exposed public methods
-def hlog(x: Any) -> None:
-    singleton.log(x)
+def hdebug(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.debug(x, **kwargs)
+def hlog(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.log(x, **kwargs)
-def hwarn(x: Any) -> None:
-    singleton.warn(x)
+def hwarn(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.warn(x, **kwargs)
 class htrack_block:
-    def __init__(self, x: Any) -> None:
+    def __init__(self, x: Any, stacklevel=1) -> None:
+        self._stacklevel = stacklevel + 1
         self.x = x
     def __enter__(self) -> None:
-        singleton.track_begin(self.x)
+        singleton.track_begin(self.x, stacklevel=self._stacklevel)
     def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
-        singleton.track_end()
+        singleton.track_end(stacklevel=self._stacklevel)
 class htrack:
@@ -116,34 +136,63 @@ class htrack:
                     description = description.replace("$" + k, str(v))
             else:
                 description = ""
-            with htrack_block(parent + fn.__name__ + description):
+            with htrack_block(parent + fn.__name__ + description, stacklevel=2):
                 return fn(*args, **kwargs)
         return wrapper
-def setup_default_logging():
+def setup_default_logging(config_path: Optional[str] = None):
     """
-    Setup a default logger to STDOUT for HELM via Python logging
-    """
-    formatter = ColoredFormatter(
-        "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
-        datefmt="%Y-%m-%dT%H:%M:%S",
-        reset=True,
-        log_colors={
-            "DEBUG": "cyan",
-            "INFO": "green",
-            "WARNING": "yellow",
-            "ERROR": "red",
-            "CRITICAL": "red,bg_white",
-        },
-        secondary_log_colors={},
-        style="%",
-    )
+    Setup Python logging for HELM
+    Priority:
+    1. External config file (YAML or JSON).
+    2. ENV var LOG_LEVEL.
+    3. a default logger to STDOUT
+    """
     logger = logging.getLogger("helm")
-    logger.setLevel(logging.INFO)
     logger.propagate = False
+    if config_path and os.path.exists(config_path):
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        logging.config.dictConfig(config)
+        hdebug("setup custom HELM logging")
+        return
+    log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
+    try:
+        logger.setLevel(getattr(logging, log_level))
+    except AttributeError:
+        logger.setLevel(logging.INFO)
+    # Set formatter
+    formatter: Optional[logging.Formatter] = None
+    if sys.stdout.isatty():
+        try:
+            formatter = ColoredFormatter(
+                "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
+                datefmt="%Y-%m-%dT%H:%M:%S",
+                reset=True,
+                log_colors={
+                    "DEBUG": "cyan",
+                    "INFO": "green",
+                    "WARNING": "yellow",
+                    "ERROR": "red",
+                    "CRITICAL": "red,bg_white",
+                },
+                style="%",
+            )
+        except ImportError:
+            pass
+    if formatter is None:
+        # fallback
+        formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    # Add default stdout handler
     handler = logging.StreamHandler(sys.stdout)
     handler.setFormatter(formatter)
     logger.addHandler(handler)
+    hdebug("setup default HELM logging")

helm/common/object_spec.py CHANGED Viewed

@@ -55,14 +55,23 @@ def inject_object_spec_args(
     This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
     Example:
-    class MyClass:
-        def __init__(a: int, b: int, c: int, d: int = 0):
-            pass
-    old_object_spec = ObjectSpec(class_name="MyClass", args={"a": 11})
-    new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
-    # new_object_spec is now ObjectSpec(class_name="MyClass", args={"a": 11, "b": 12, "c": 13})
+        >>> from helm.common.object_spec import *  # NOQA
+        >>> import sys, types
+        >>> # Given a custom class with hashable arguments
+        >>> class MyClass:
+        ...     def __init__(a: int, b: int, c: int, d: int = 0):
+        ...         pass
+        >>> #
+        >>> # <boilerplate>: make a dummy module for MyClass to make this doctest exectuable
+        >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
+        >>> # </boilerplate>
+        >>> #
+        >>> # Define new style and old style object specs
+        >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
+        >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
+        >>> # new_object_spec is now
+        >>> print(new_object_spec)
+        ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
     """
     cls = get_class_by_name(spec.class_name)
     init_signature = inspect.signature(cls.__init__)
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
         <class_name>:<key>=<value>,<key>=<value>
     Usually, the description is something that's succinct and can be typed on the command-line.
     Here, value defaults to string.
+    Example:
+        >>> from helm.common.object_spec import *  # NOQA
+        >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
+        >>> parse_object_spec(description)
+        ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
     """
     def parse_arg(arg: str) -> Tuple[str, Any]:

helm/common/test_logging.py ADDED Viewed

@@ -0,0 +1,94 @@
+import sys
+import tempfile
+import textwrap
+import pathlib
+from helm.benchmark import run
+from typing import List, Optional
+class ArgvContext:
+    """
+    Helper to assign a temporary value to sys.argv and then restore it
+    """
+    def __init__(self, argv: Optional[List[str]]):
+        self.argv = argv
+        self._original_argv: Optional[List[str]] = None
+    def __enter__(self):
+        self._original_argv = sys.argv[:]
+        sys.argv = self.argv or []
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        assert self._original_argv is not None  # Satisfies mypy
+        sys.argv = self._original_argv
+def test_run_with_custom_logging_config():
+    # Setup temporary directory
+    with tempfile.TemporaryDirectory(prefix="helm_test_") as tmp_dir_str:
+        tmp_dir = pathlib.Path(tmp_dir_str)
+        log_path = tmp_dir / "test.log"
+        log_config_path = tmp_dir / "test_config.yaml"
+        # Write custom YAML log config to file
+        log_config_text = textwrap.dedent(
+            f"""
+            version: 1
+            disable_existing_loggers: false
+            formatters:
+              simple:
+                datefmt: '%Y-%m-%dT%H:%M:%S'
+                format: '%(asctime)s %(levelname)s %(name)s %(message)s'
+            handlers:
+              file:
+                class: logging.FileHandler
+                filename: {log_path}
+                formatter: simple
+                level: DEBUG
+                mode: w
+            loggers:
+              helm:
+                handlers:
+                - file
+                level: DEBUG
+                propagate: false
+            """
+        ).strip()
+        log_config_path.write_text(log_config_text)
+        # Simulate command-line arguments
+        argv = [
+            "run.py",  # Fake script name
+            "--run-entries",
+            "mmlu:subject=philosophy,model=openai/gpt2",
+            "-m",
+            "1",
+            "--suite",
+            "my-suite",
+            "--dry-run",
+            "--log-config",
+            str(log_config_path),
+        ]
+        # Call main
+        with ArgvContext(argv):
+            run.main()
+        # Check log file contents
+        assert log_path.exists(), "Log file was not created"
+        log_contents = log_path.read_text()
+        # Test that log file was written to disk as requested
+        print("Log Contents")
+        print("------------")
+        print(log_contents)
+        assert (
+            "mscoco" in log_contents or "huggingface" in log_contents or "dry-run" in log_contents
+        ), "Expected log content not found in log file:\n"
+if __name__ == "__main__":
+    test_run_with_custom_logging_config()

crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl