PyPI - crfm-helm - Versions diffs - 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (546) hide show

helm/benchmark/presentation/create_plots.py CHANGED Viewed

@@ -10,9 +10,10 @@ from typing import List, Dict, Optional, Any, Callable, Union, Mapping, Tuple, S
 import numpy as np
 from scipy.stats import pearsonr
+from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
 from helm.common.hierarchical_logger import hlog
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.presentation.schema import read_schema
+from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
 try:
@@ -133,9 +134,6 @@ class Plotter:
         self.plot_format = plot_format
         self._tables_cache: Dict[str, Dict[str, Table]] = {}
-        schema = read_schema()
-        self.model_metadata = {model_field.display_name: model_field for model_field in schema.models}
     def get_group_tables(self, group_name: str) -> Dict[str, Table]:
         """Reads and parses group tables. Uses _tables_cache to avoid reprocessing the same table multiple times."""
         if group_name in self._tables_cache:
@@ -338,14 +336,14 @@ class Plotter:
         def get_model_release_date(model_name: str) -> Optional[date]:
             """Maps a model name to the month of model release."""
-            release_date = self.model_metadata[model_name].release_date
+            release_date = MODEL_NAME_TO_MODEL_METADATA[model_name].release_date
             if release_date is None:
                 return None
             return release_date.replace(day=1)
         def get_model_size(model_name: str) -> Optional[int]:
             """Maps a model name to the number of parameters, rounding to the nearest leading digit."""
-            size = self.model_metadata[model_name].num_parameters
+            size = MODEL_NAME_TO_MODEL_METADATA[model_name].num_parameters
             if size is None:
                 return None
             grain = 10 ** (len(str(size)) - 1)
@@ -401,7 +399,9 @@ class Plotter:
         for i, access_level in enumerate(access_levels):
             model_indices: List[int] = [
-                idx for idx, model in enumerate(table.adapters) if self.model_metadata[model].access == access_level
+                idx
+                for idx, model in enumerate(table.adapters)
+                if MODEL_NAME_TO_MODEL_METADATA[model].access == access_level
             ]
             best_model_index = model_indices[table.mean_win_rates[model_indices].argmax()]
@@ -611,6 +611,7 @@ def main():
     parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
     parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
     args = parser.parse_args()
+    register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
         hlog(f"ERROR: Could not find `groups` directory under {base_path}. Did you run `summarize.py` first?")

helm/benchmark/presentation/run_display.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
-from helm.benchmark.adaptation.adapters.adapter_factory import (
+from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
 )
@@ -12,11 +12,13 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
 from helm.benchmark.metrics.metric import PerInstanceStats
+from helm.common.multimodal_request_utils import gather_generated_image_locations
 from helm.benchmark.presentation.schema import Schema
-from helm.benchmark.runner import RunSpec
+from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.scenarios.scenario import Instance
 from helm.common.general import write
 from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.images_utils import encode_base64
 from helm.common.request import Request
 from helm.common.codec import from_json, to_json
@@ -43,6 +45,9 @@ class DisplayPrediction:
     truncated_predicted_text: Optional[str]
     """The truncated prediction text, if truncation is required by the Adapter method."""
+    base64_images: Optional[List[str]]
+    """Images in base64."""
     mapped_output: Optional[str]
     """The mapped output, if an output mapping exists and the prediction can be mapped"""
@@ -52,6 +57,8 @@ class DisplayPrediction:
     stats: Dict[str, float]
     """Statistics computed from the predicted output"""
+    annotations: Optional[Dict[str, Any]]
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -73,19 +80,17 @@ class DisplayRequest:
     """The actual Request to display in the web frontend.
     There can be multiple requests per trial. The displayed request should be the
-    most relevant request e.g. the request for the chosen cohice for multiple choice questions."""
+    most relevant request e.g. the request for the chosen choice for multiple choice questions."""
-def _read_scenario_state(run_path: str) -> ScenarioState:
-    scenario_state_path: str = os.path.join(run_path, "scenario_state.json")
+def _read_scenario_state(scenario_state_path: str) -> ScenarioState:
     if not os.path.exists(scenario_state_path):
         raise ValueError(f"Could not load ScenarioState from {scenario_state_path}")
     with open(scenario_state_path) as f:
         return from_json(f.read(), ScenarioState)
-def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
-    per_instance_stats_path: str = os.path.join(run_path, "per_instance_stats.json")
+def _read_per_instance_stats(per_instance_stats_path: str) -> List[PerInstanceStats]:
     if not os.path.exists(per_instance_stats_path):
         raise ValueError(f"Could not load PerInstanceStats from {per_instance_stats_path}")
     with open(per_instance_stats_path) as f:
@@ -104,8 +109,7 @@ def _truncate_predicted_text(
             tokens = request_state.result.completions[0].tokens
             if tokens:
                 first_token = tokens[0]
-                if not first_token.top_logprobs:
-                    prefix = first_token.text
+                prefix = first_token.text
     if prefix:
         predicted_text = predicted_text
         prefix = prefix
@@ -128,7 +132,7 @@ def _get_metric_names_for_group(run_group_name: str, schema: Schema) -> Set[str]
         if metric_group is None:
             continue
         for metric_name_matcher in metric_group.metrics:
-            if metric_name_matcher.perturbation_name:
+            if metric_name_matcher.perturbation_name and metric_name_matcher.perturbation_name != "__all__":
                 continue
             result.add(metric_name_matcher.substitute(run_group.environment).name)
     return result
@@ -168,16 +172,35 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
     display_predictions_file_path = os.path.join(run_path, _DISPLAY_PREDICTIONS_JSON_FILE_NAME)
     display_requests_file_path = os.path.join(run_path, _DISPLAY_REQUESTS_JSON_FILE_NAME)
+    scenario_state_path = os.path.join(run_path, "scenario_state.json")
+    per_instance_stats_path = os.path.join(run_path, "per_instance_stats.json")
     if (
         skip_completed
         and os.path.exists(instances_file_path)
         and os.path.exists(display_predictions_file_path)
         and os.path.exists(display_requests_file_path)
     ):
-        hlog(f"Skipping writing display JSON for run {run_spec.name} because all output display JSON files exist.")
+        hlog(
+            f"Skipping writing display JSON for run {run_spec.name} "
+            "because all output display JSON files already exist."
+        )
+        return
+    elif not os.path.exists(scenario_state_path):
+        hlog(
+            f"Skipping writing display JSON for run {run_spec.name} because "
+            f"the scenario state JSON file does not exist at {scenario_state_path}"
+        )
         return
-    scenario_state = _read_scenario_state(run_path)
-    per_instance_stats = _read_per_instance_stats(run_path)
+    elif not os.path.exists(per_instance_stats_path):
+        hlog(
+            f"Skipping writing display JSON for run {run_spec.name} because "
+            f"the per instance stats JSON file does not exist at {per_instance_stats_path}"
+        )
+        return
+    scenario_state = _read_scenario_state(scenario_state_path)
+    per_instance_stats = _read_per_instance_stats(per_instance_stats_path)
     metric_names = _get_metric_names_for_groups(run_spec.groups, schema)
@@ -239,9 +262,17 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
         mapped_output = (
             request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
         )
-        instance_id_to_instance[
-            (request_state.instance.id, request_state.instance.perturbation)
-        ] = request_state.instance
+        instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
+            request_state.instance
+        )
+        # Process images and include if they exist
+        images: List[str] = [
+            encode_base64(image_location)
+            for image_location in gather_generated_image_locations(request_state.result)
+            if os.path.exists(image_location)
+        ]
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -249,9 +280,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 train_trial_index=request_state.train_trial_index,
                 predicted_text=predicted_text,
                 truncated_predicted_text=_truncate_predicted_text(predicted_text, request_state, run_spec.adapter_spec),
+                base64_images=images,
                 mapped_output=mapped_output,
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
+                annotations=request_state.annotations,
             )
         )
         requests.append(

helm/benchmark/presentation/schema.py CHANGED Viewed

@@ -1,9 +1,8 @@
 from dataclasses import dataclass, field
-from datetime import date
 from typing import List, Optional, Dict
 import dacite
 import mako.template
-import yaml  # type: ignore
+import yaml
 import importlib_resources as resources
 from helm.common.general import hlog
@@ -11,8 +10,11 @@ from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+# TODO: change to `helm.benchmark.config`
 SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
-SCHEMA_YAML_FILENAME: str = "schema.yaml"
+# TODO: add heim, vhelm, etc.
+SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"
 @dataclass(frozen=True)
@@ -43,34 +45,6 @@ class Field:
         return name
-# Note: also see Model from `models.py`.
-@dataclass(frozen=True)
-class ModelField(Field):
-    # Organization that originally created the model (e.g. "EleutherAI")
-    #   Note that this may be different from group or the prefix of the model `name`
-    #   ("together" in "together/gpt-j-6b") as the hosting organization
-    #   may be different from the creator organization. We also capitalize
-    #   this field properly to later display in the UI.
-    # TODO: in the future, we want to cleanup the naming in the following ways:
-    # - make the creator_organization an identifier with a separate display name
-    # - have a convention like <hosting_organization><creator_organization>/<model_name>
-    creator_organization: Optional[str] = None
-    # How this model is available (e.g., limited)
-    access: Optional[str] = None
-    # Whether we have yet to evaluate this model
-    todo: bool = False
-    # When was the model released
-    release_date: Optional[date] = None
-    # The number of parameters
-    # This should be a string as the number of parameters is usually a round number (175B),
-    # but we set it as an int for plotting purposes.
-    num_parameters: Optional[int] = None
 @dataclass(frozen=True)
 class MetricNameMatcher:
     """
@@ -97,7 +71,7 @@ class MetricNameMatcher:
         if self.name != metric_name.name:
             return False
-        if self.split != metric_name.split:
+        if self.split != "__all__" and self.split != metric_name.split:
             return False
         # Optional
@@ -118,9 +92,11 @@ class MetricNameMatcher:
         return MetricNameMatcher(
             name=mako.template.Template(self.name).render(**environment),
             split=mako.template.Template(self.split).render(**environment),
-            perturbation_name=mako.template.Template(self.perturbation_name).render(**environment)
-            if self.perturbation_name is not None
-            else None,
+            perturbation_name=(
+                mako.template.Template(self.perturbation_name).render(**environment)
+                if self.perturbation_name is not None
+                else None
+            ),
         )
@@ -132,6 +108,9 @@ class MetricGroup(Field):
     metrics: List[MetricNameMatcher] = field(default_factory=list)
+    hide_win_rates: Optional[bool] = None
+    """If set to true, do not compute win rates."""
 BY_METRIC = "by_metric"
 BY_GROUP = "by_group"
@@ -207,18 +186,18 @@ class RunGroup(Field):
     # Which adapter_spec fields we should preserve when displaying methods for this group
     # When we are constructing a table where the rows are methods, what constitutes a "method" is given by the set of
-    # adapter keys. By default, this should just be "model" (e.g., BLOOM), where details like "num_train_instances" are
-    # "marginalized out". However, for ablations, we want to include both "model" and "num_train_instances".
-    adapter_keys_shown: List[str] = field(default_factory=lambda: ["model"])
+    # adapter keys. By default, this should just be "model_deployment" (e.g., BLOOM), where details like
+    # "num_train_instances" are "marginalized out". However, for ablations, we want to include both "model_deployment"
+    # and "num_train_instances".
+    # NOTE: "model" is kept for backward compatibility reason.
+    # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
+    adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])
 @dataclass
 class Schema:
     """Specifies information about what to display on the frontend."""
-    # Models
-    models: List[ModelField]
     # Adapter fields (e.g., temperature)
     adapter: List[Field]
@@ -235,16 +214,19 @@ class Schema:
     run_groups: List[RunGroup]
     def __post_init__(self):
-        self.name_to_model = {model.name: model for model in self.models}
         self.name_to_metric = {metric.name: metric for metric in self.metrics}
         self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
         self.name_to_metric_group = {metric_group.name: metric_group for metric_group in self.metric_groups}
         self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
-def read_schema() -> Schema:
-    hlog(f"Reading schema from {SCHEMA_YAML_FILENAME}...")
-    schema_path = resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_YAML_FILENAME)
-    with schema_path.open("r") as f:
+def get_default_schema_path() -> str:
+    return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)
+def read_schema(schema_path: str) -> Schema:
+    # TODO: merge in model metadata from `model_metadata.yaml`
+    hlog(f"Reading schema file {schema_path}...")
+    with open(schema_path, "r") as f:
         raw = yaml.safe_load(f)
     return dacite.from_dict(Schema, raw)

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl