PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482) hide show

helm/benchmark/model_metadata_registry.py CHANGED Viewed

@@ -22,11 +22,17 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
+# Mistral instruction-following format
+MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
+ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"
 GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
+GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
+GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"
 # Models which emit garbage tokens when temperature=0.
 BUGGY_TEMP_0_TAG: str = "BUGGY_TEMP_0_TAG"
@@ -46,12 +52,27 @@ NLG_PREFIX_TAG: str = "NLG_PREFIX_TAG"
 # Some models can follow instructions.
 INSTRUCTION_FOLLOWING_MODEL_TAG: str = "INSTRUCTION_FOLLOWING_MODEL_TAG"
+# For text-to-image models
+TEXT_TO_IMAGE_MODEL_TAG: str = "TEXT_TO_IMAGE_MODEL_TAG"
 # For Vision-language models (VLMs)
 VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
+# IDEFICS models require a special prompt format (see `IDEFICSInstructRunExpander`)
+IDEFICS_INSTRUCT_MODEL_TAG: str = "IDEFICS_INSTRUCT_MODEL_TAG"
+IDEFICS_MODEL_TAG: str = "IDEFICS_MODEL_TAG"
+# Llava should use a special prompt format (see `LlavaRunExpander`)
+LLAVA_MODEL_TAG: str = "LLAVA_MODEL_TAG"
+# OpenFlamingo has a special prompt format (see `OpenFlamingoRunExpander`)
+OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
+# Some VLMs do not support multiple images in the prompt
+LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
+FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 @dataclass(frozen=False)
 class ModelMetadata:
     name: str
@@ -153,6 +174,11 @@ def get_model_names_with_tag(tag: str) -> List[str]:
     return [model.name for model in ALL_MODELS_METADATA if tag in model.tags]
+def model_has_tag(model_name: str, tag: str) -> bool:
+    """Return True if the model has the given tag. False otherwise."""
+    return tag in get_model_metadata(model_name).tags
 def get_all_text_models() -> List[str]:
     """Return all model names of text models."""
     return get_model_names_with_tag(TEXT_MODEL_TAG)
@@ -168,6 +194,16 @@ def get_all_instruction_following_models() -> List[str]:
     return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG)
+def is_text_to_image_model(model_name: str) -> bool:
+    """Returns True if the model is a text-to-image model. False otherwise."""
+    return model_has_tag(model_name, TEXT_TO_IMAGE_MODEL_TAG)
+def is_vlm(model_name: str) -> bool:
+    """Returns True if the model is a vision-language model (VLM). False otherwise."""
+    return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(

helm/benchmark/multi_gpu_runner.py ADDED Viewed

@@ -0,0 +1,133 @@
+import signal
+import threading
+import traceback
+from typing import List
+import os
+import time
+import torch
+import torch.multiprocessing as multiprocessing
+from concurrent.futures import ProcessPoolExecutor as Pool
+from tqdm import tqdm
+from helm.benchmark.config_registry import (
+    register_configs_from_directory,
+    register_builtin_configs_from_helm_package,
+)
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.runner import Runner, RunSpec, RunnerError
+from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.benchmark.runner_config_registry import RUNNER_CONFIG
+_MAX_CONCURRENT_WORKERS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKERS"
+# From
+# https://stackoverflow.com/questions/71300294/how-to-terminate-pythons-processpoolexecutor-when-parent-process-dies
+def start_thread_to_terminate_when_parent_process_dies(ppid):
+    pid = os.getpid()
+    def f():
+        while True:
+            try:
+                os.kill(ppid, 0)
+            except OSError:
+                os.kill(pid, signal.SIGTERM)
+            time.sleep(1)
+    thread = threading.Thread(target=f, daemon=True)
+    thread.start()
+def initialize_worker(gpu_id: int):
+    hlog(f"Worker {gpu_id} initializing")
+    # Wait for 0.1 seconds to ensure all workers are initialized with different CUDA_VISIBLE_DEVICES
+    time.sleep(0.1)
+    # Pin GPU to worker process
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+    # Necessary for code_metrics in humaneval to work properly
+    multiprocessing.set_start_method("fork", force=True)
+class MultiGPURunner(Runner):
+    """Runner that runs the entire benchmark on multiple GPUs.
+    This is a thin wrapper around `Runner` that runs the entire benchmark on
+    multiple GPUs using `multiprocessing`.
+    Note that this runner will load multiple models into memory at the same
+    time if your running configuration specifies that, similar to the `Runner`
+    class. `SlurmRunner` on the other hand will load at most one model on a
+    GPU"""
+    def __init__(
+        self,
+        execution_spec: ExecutionSpec,
+        output_path: str,
+        suite: str,
+        skip_instances: bool,
+        cache_instances: bool,
+        cache_instances_only: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
+    ):
+        super().__init__(
+            execution_spec=execution_spec,
+            output_path=output_path,
+            suite=suite,
+            skip_instances=skip_instances,
+            cache_instances=cache_instances,
+            cache_instances_only=cache_instances_only,
+            skip_completed_runs=skip_completed_runs,
+            exit_on_error=exit_on_error,
+        )
+        # Configure max concurrent worker jobs from the environment variable.
+        env_max_concurrent_workers = os.getenv(_MAX_CONCURRENT_WORKERS_ENV_NAME)
+        self.max_concurrent_workers = (
+            int(env_max_concurrent_workers)
+            if env_max_concurrent_workers
+            else (
+                RUNNER_CONFIG.helm_max_concurrent_workers
+                if RUNNER_CONFIG.helm_max_concurrent_workers > 0
+                else torch.cuda.device_count()
+            )
+        )
+    def safe_run_one(self, run_spec: RunSpec):
+        register_builtin_configs_from_helm_package()
+        if self.executor.execution_spec.local_path is not None:
+            register_configs_from_directory(self.executor.execution_spec.local_path)
+        try:
+            with htrack_block(f"Running {run_spec.name}"):
+                self.run_one(run_spec)
+        except Exception as e:
+            hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+            return e
+    def run_all(self, run_specs: List[RunSpec]):
+        """Run the entire benchmark on multiple GPU"""
+        # Set the start method to forkserver to avoid issues with CUDA.
+        multiprocessing.set_start_method("forkserver")
+        with Pool(
+            max_workers=self.max_concurrent_workers,
+            initializer=start_thread_to_terminate_when_parent_process_dies,
+            initargs=(os.getpid(),),
+        ) as pool:
+            # Pin GPUs to each worker process
+            pool.map(initialize_worker, [i for i in range(self.max_concurrent_workers)])
+            # Run all queued tasks
+            error_msgs = list(tqdm(pool.map(self.safe_run_one, run_specs), total=len(run_specs), disable=None))
+        # Raise exception for failed runs, if any.
+        failed_run_names = [
+            run_spec.name for error_msg, run_spec in zip(error_msgs, run_specs) if error_msg is not None
+        ]
+        if failed_run_names:
+            failed_runs_str = ", ".join([f'"{run_name}"' for run_name in failed_run_names])
+            raise RunnerError(f"Failed runs: [{failed_runs_str}]")

helm/benchmark/presentation/create_plots.py CHANGED Viewed

@@ -10,9 +10,10 @@ from typing import List, Dict, Optional, Any, Callable, Union, Mapping, Tuple, S
 import numpy as np
 from scipy.stats import pearsonr
+from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
 from helm.common.hierarchical_logger import hlog
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.presentation.schema import read_schema, SCHEMA_CLASSIC_YAML_FILENAME
+from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
 try:
@@ -133,9 +134,6 @@ class Plotter:
         self.plot_format = plot_format
         self._tables_cache: Dict[str, Dict[str, Table]] = {}
-        schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
-        self.model_metadata = {model_field.display_name: model_field for model_field in schema.models}
     def get_group_tables(self, group_name: str) -> Dict[str, Table]:
         """Reads and parses group tables. Uses _tables_cache to avoid reprocessing the same table multiple times."""
         if group_name in self._tables_cache:
@@ -338,14 +336,14 @@ class Plotter:
         def get_model_release_date(model_name: str) -> Optional[date]:
             """Maps a model name to the month of model release."""
-            release_date = self.model_metadata[model_name].release_date
+            release_date = MODEL_NAME_TO_MODEL_METADATA[model_name].release_date
             if release_date is None:
                 return None
             return release_date.replace(day=1)
         def get_model_size(model_name: str) -> Optional[int]:
             """Maps a model name to the number of parameters, rounding to the nearest leading digit."""
-            size = self.model_metadata[model_name].num_parameters
+            size = MODEL_NAME_TO_MODEL_METADATA[model_name].num_parameters
             if size is None:
                 return None
             grain = 10 ** (len(str(size)) - 1)
@@ -401,7 +399,9 @@ class Plotter:
         for i, access_level in enumerate(access_levels):
             model_indices: List[int] = [
-                idx for idx, model in enumerate(table.adapters) if self.model_metadata[model].access == access_level
+                idx
+                for idx, model in enumerate(table.adapters)
+                if MODEL_NAME_TO_MODEL_METADATA[model].access == access_level
             ]
             best_model_index = model_indices[table.mean_win_rates[model_indices].argmax()]
@@ -611,6 +611,7 @@ def main():
     parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
     parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
     args = parser.parse_args()
+    register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
         hlog(f"ERROR: Could not find `groups` directory under {base_path}. Did you run `summarize.py` first?")

helm/benchmark/presentation/run_display.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
-from helm.benchmark.adaptation.adapters.adapter_factory import (
+from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
 )
@@ -12,11 +12,13 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
 from helm.benchmark.metrics.metric import PerInstanceStats
+from helm.common.multimodal_request_utils import gather_generated_image_locations
 from helm.benchmark.presentation.schema import Schema
-from helm.benchmark.runner import RunSpec
+from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.scenarios.scenario import Instance
 from helm.common.general import write
 from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.images_utils import encode_base64
 from helm.common.request import Request
 from helm.common.codec import from_json, to_json
@@ -43,6 +45,9 @@ class DisplayPrediction:
     truncated_predicted_text: Optional[str]
     """The truncated prediction text, if truncation is required by the Adapter method."""
+    base64_images: Optional[List[str]]
+    """Images in base64."""
     mapped_output: Optional[str]
     """The mapped output, if an output mapping exists and the prediction can be mapped"""
@@ -52,6 +57,8 @@ class DisplayPrediction:
     stats: Dict[str, float]
     """Statistics computed from the predicted output"""
+    annotations: Optional[Dict[str, Any]]
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -73,7 +80,7 @@ class DisplayRequest:
     """The actual Request to display in the web frontend.
     There can be multiple requests per trial. The displayed request should be the
-    most relevant request e.g. the request for the chosen cohice for multiple choice questions."""
+    most relevant request e.g. the request for the chosen choice for multiple choice questions."""
 def _read_scenario_state(scenario_state_path: str) -> ScenarioState:
@@ -102,8 +109,7 @@ def _truncate_predicted_text(
             tokens = request_state.result.completions[0].tokens
             if tokens:
                 first_token = tokens[0]
-                if not first_token.top_logprobs:
-                    prefix = first_token.text
+                prefix = first_token.text
     if prefix:
         predicted_text = predicted_text
         prefix = prefix
@@ -126,7 +132,7 @@ def _get_metric_names_for_group(run_group_name: str, schema: Schema) -> Set[str]
         if metric_group is None:
             continue
         for metric_name_matcher in metric_group.metrics:
-            if metric_name_matcher.perturbation_name:
+            if metric_name_matcher.perturbation_name and metric_name_matcher.perturbation_name != "__all__":
                 continue
             result.add(metric_name_matcher.substitute(run_group.environment).name)
     return result
@@ -256,9 +262,17 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
         mapped_output = (
             request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
         )
-        instance_id_to_instance[
-            (request_state.instance.id, request_state.instance.perturbation)
-        ] = request_state.instance
+        instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
+            request_state.instance
+        )
+        # Process images and include if they exist
+        images: List[str] = [
+            encode_base64(image_location)
+            for image_location in gather_generated_image_locations(request_state.result)
+            if os.path.exists(image_location)
+        ]
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -266,9 +280,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 train_trial_index=request_state.train_trial_index,
                 predicted_text=predicted_text,
                 truncated_predicted_text=_truncate_predicted_text(predicted_text, request_state, run_spec.adapter_spec),
+                base64_images=images,
                 mapped_output=mapped_output,
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
+                annotations=request_state.annotations,
             )
         )
         requests.append(

helm/benchmark/presentation/schema.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from dataclasses import dataclass, field
-from datetime import date
 from typing import List, Optional, Dict
 import dacite
 import mako.template
@@ -46,34 +45,6 @@ class Field:
         return name
-# Note: also see Model from `models.py`.
-@dataclass(frozen=True)
-class ModelField(Field):
-    # Organization that originally created the model (e.g. "EleutherAI")
-    #   Note that this may be different from group or the prefix of the model `name`
-    #   ("together" in "together/gpt-j-6b") as the hosting organization
-    #   may be different from the creator organization. We also capitalize
-    #   this field properly to later display in the UI.
-    # TODO: in the future, we want to cleanup the naming in the following ways:
-    # - make the creator_organization an identifier with a separate display name
-    # - have a convention like <hosting_organization><creator_organization>/<model_name>
-    creator_organization: Optional[str] = None
-    # How this model is available (e.g., limited)
-    access: Optional[str] = None
-    # Whether we have yet to evaluate this model
-    todo: bool = False
-    # When was the model released
-    release_date: Optional[date] = None
-    # The number of parameters
-    # This should be a string as the number of parameters is usually a round number (175B),
-    # but we set it as an int for plotting purposes.
-    num_parameters: Optional[int] = None
 @dataclass(frozen=True)
 class MetricNameMatcher:
     """
@@ -100,7 +71,7 @@ class MetricNameMatcher:
         if self.name != metric_name.name:
             return False
-        if self.split != metric_name.split:
+        if self.split != "__all__" and self.split != metric_name.split:
             return False
         # Optional
@@ -121,9 +92,11 @@ class MetricNameMatcher:
         return MetricNameMatcher(
             name=mako.template.Template(self.name).render(**environment),
             split=mako.template.Template(self.split).render(**environment),
-            perturbation_name=mako.template.Template(self.perturbation_name).render(**environment)
-            if self.perturbation_name is not None
-            else None,
+            perturbation_name=(
+                mako.template.Template(self.perturbation_name).render(**environment)
+                if self.perturbation_name is not None
+                else None
+            ),
         )
@@ -135,6 +108,9 @@ class MetricGroup(Field):
     metrics: List[MetricNameMatcher] = field(default_factory=list)
+    hide_win_rates: Optional[bool] = None
+    """If set to true, do not compute win rates."""
 BY_METRIC = "by_metric"
 BY_GROUP = "by_group"
@@ -222,9 +198,6 @@ class RunGroup(Field):
 class Schema:
     """Specifies information about what to display on the frontend."""
-    # Models
-    models: List[ModelField]
     # Adapter fields (e.g., temperature)
     adapter: List[Field]
@@ -241,17 +214,19 @@ class Schema:
     run_groups: List[RunGroup]
     def __post_init__(self):
-        self.name_to_model = {model.name: model for model in self.models}
         self.name_to_metric = {metric.name: metric for metric in self.metrics}
         self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
         self.name_to_metric_group = {metric_group.name: metric_group for metric_group in self.metric_groups}
         self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
-def read_schema(filename: str) -> Schema:
+def get_default_schema_path() -> str:
+    return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)
+def read_schema(schema_path: str) -> Schema:
     # TODO: merge in model metadata from `model_metadata.yaml`
-    schema_path = resources.files(SCHEMA_YAML_PACKAGE).joinpath(filename)
     hlog(f"Reading schema file {schema_path}...")
-    with schema_path.open("r") as f:
+    with open(schema_path, "r") as f:
         raw = yaml.safe_load(f)
     return dacite.from_dict(Schema, raw)

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl