crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0

helm/benchmark/metrics/tokens/auto_token_cost_estimator.py
@@ -2,12 +2,12 @@ from typing import Dict
 
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .ai21_token_cost_estimator import AI21TokenCostEstimator
-from .cohere_token_cost_estimator import CohereTokenCostEstimator
-from .free_token_cost_estimator import FreeTokenCostEstimator
-from .gooseai_token_cost_estimator import GooseAITokenCostEstimator
-from .openai_token_cost_estimator import OpenAITokenCostEstimator
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
+from helm.benchmark.metrics.tokens.cohere_token_cost_estimator import CohereTokenCostEstimator
+from helm.benchmark.metrics.tokens.free_token_cost_estimator import FreeTokenCostEstimator
+from helm.benchmark.metrics.tokens.gooseai_token_cost_estimator import GooseAITokenCostEstimator
+from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class AutoTokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py
@@ -1,6 +1,6 @@
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class CohereTokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/free_token_cost_estimator.py
@@ -1,6 +1,6 @@
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class FreeTokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.common.request import Request
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class GooseAITokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/openai_token_cost_estimator.py
@@ -2,7 +2,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.request import Request
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.window_service import WindowService
-from .token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator
 
 
 class OpenAITokenCostEstimator(TokenCostEstimator):

helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
@@ -1,7 +1,7 @@
 from unittest.mock import Mock
 
 from helm.common.request import Request
-from .ai21_token_cost_estimator import AI21TokenCostEstimator
+from helm.benchmark.metrics.tokens.ai21_token_cost_estimator import AI21TokenCostEstimator
 
 
 class TestAI21TokenCostEstimator:

helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
 from helm.common.request import Request
 from helm.common.tokenization_request import TokenizationRequestResult, TokenizationToken
 from helm.proxy.services.remote_service import RemoteService
-from .openai_token_cost_estimator import OpenAITokenCostEstimator
+from helm.benchmark.metrics.tokens.openai_token_cost_estimator import OpenAITokenCostEstimator
 
 
 class TestOpenAITokenCostEstimator:

helm/benchmark/metrics/toxicity_metrics.py
@@ -6,10 +6,10 @@ from helm.common.hierarchical_logger import hlog
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 
 
 class ToxicityMetric(Metric):

helm/benchmark/metrics/unitxt_metrics.py
@@ -18,7 +18,10 @@ class UnitxtMetric(MetricInterface):
 
     def __init__(self, **kwargs):
         super().__init__()
-        dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
+        if len(kwargs) == 1 and "recipe" in kwargs:
+            dataset_name = kwargs["recipe"]
+        else:
+            dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
         self.dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
 
     def evaluate(
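
A minimal sketch of the new dataset-name resolution in UnitxtMetric, pulled out as a standalone helper for illustration (the helper name and the example recipe values are hypothetical, not part of HELM):

def resolve_unitxt_dataset_name(**kwargs) -> str:
    # Mirrors the logic added to UnitxtMetric.__init__: a lone "recipe" keyword
    # is passed through verbatim; otherwise the keyword arguments are joined
    # into unitxt's "key=value,key=value" recipe string.
    if len(kwargs) == 1 and "recipe" in kwargs:
        return kwargs["recipe"]
    return ",".join(f"{key}={value}" for key, value in kwargs.items())

# resolve_unitxt_dataset_name(recipe="card=cards.example,template=templates.example")
#   -> "card=cards.example,template=templates.example"
# resolve_unitxt_dataset_name(card="cards.example", template="templates.example")
#   -> "card=cards.example,template=templates.example"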

helm/benchmark/metrics/vision_language/image_metrics.py
@@ -324,7 +324,7 @@ class AnnotatedImageMetrics(Metric):
 
         # Compute the LPIPS score
         assert self._lpips_metric is not None
-        score: float = self._lpips_metric(img1, img2).detach().item()
+        score: float = 1.0 - self._lpips_metric(img1, img2).detach().item()
         return score
 
     def _calculate_fid(self, act1, act2):
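
Since LPIPS is a perceptual distance (lower means the two images are more similar), subtracting the raw value from 1.0 turns the reported score into a similarity-style metric where higher is better.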

helm/benchmark/metrics/wildbench_metrics.py (new file)
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class WildBenchScoreMetric(Metric):
+    """Score metrics for WildBench."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["wildbench"]
+        scores: List[float] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_score") and annotation_value is not None:
+                scores.append(annotation_value)
+        if not scores:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(scores) / len(scores)
+        score_rescaled = (score - 1) / 9
+        return [
+            Stat(MetricName("wildbench_score")).add(score),
+            Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
+        ]
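
The rescaling at the end of the new metric maps the averaged judge score onto [0, 1]; the (score - 1) / 9 form implies annotator scores on a 1-10 scale (an inference from the formula, not stated in the diff). A small illustrative check:

def rescale_wildbench_score(score: float) -> float:
    # Same arithmetic as WildBenchScoreMetric, assuming judge scores in [1, 10].
    return (score - 1) / 9

assert rescale_wildbench_score(1.0) == 0.0    # worst judge score maps to 0
assert rescale_wildbench_score(10.0) == 1.0   # best judge score maps to 1
assert abs(rescale_wildbench_score(5.5) - 0.5) < 1e-9  # midpoint maps to 0.5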

helm/benchmark/model_metadata_registry.py
@@ -22,6 +22,9 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
+# For NOVA models
+NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
+
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"

@@ -66,10 +69,18 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# For Audio-langauge models (AudioLMs)
+AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
+
 # Deprecated models that are no longer available.
 # These are usually closed API models that have been permanently removed
 DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
+# Unsupported models.
+# These are models that we have chosen not to support because they are
+# private, stale, non-notable, or difficult to implement.
+UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
+
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 

@@ -208,6 +219,11 @@ def is_vlm(model_name: str) -> bool:
     return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
 
 
+def is_audiolm(model_name: str) -> bool:
+    """Returns True if the model is a audio-language model (AudioLM). False otherwise."""
+    return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
+
+
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(

helm/benchmark/presentation/summarize.py
@@ -52,6 +52,10 @@ from helm.benchmark.presentation.schema import (
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
+
+
+MODEL_HEADER_CELL_VALUE = "Model"
 
 
 @dataclass(frozen=True)

@@ -262,18 +266,22 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     """
 
     row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
 
     # check for all header cells where specified, that lower_is_better is consistent
     orderings = []
-    for header_cell in table.header:
-        orderings.append(header_cell.lower_is_better)
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
     if len(set(orderings)) != 1:
         raise Exception("Cannot mean columns with different values for lower_is_better")
 
     for row in table.rows:
         total = 0.0
         count = 0
-        for cell in row:
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
             if cell.value is not None:
                 total += float(cell.value)
                 count += 1
|
|
|
811
819
|
num_groups = len(set(run_group.name for run_group, _ in columns)) # number of unique groups, determines headers
|
|
812
820
|
|
|
813
821
|
# Column headers
|
|
814
|
-
header.append(HeaderCell(
|
|
822
|
+
header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
|
|
815
823
|
for run_group, metric_group_name in columns:
|
|
816
824
|
# check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
|
|
817
825
|
if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
|
|

@@ -969,22 +977,20 @@
             if strategy == AggregationStrategy.WIN_RATE:
                 WIN_RATE_AGGREGATION = "mean"
                 win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-                description = "How many models this model outperforms on average (over columns)."
                 aggregate_header_cells.append(
                     HeaderCell(
                         f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                        description=description,
+                        description="How many models this model outperforms on average (over columns).",
                         lower_is_better=False,
                     )
                 )
                 aggregate_row_values.append(win_rates)
             elif strategy == AggregationStrategy.MEAN:
                 means = compute_aggregate_row_means(table)
-                description = "An average over columns representing the mean performance."
                 aggregate_header_cells.append(
                     HeaderCell(
-                        "Mean
-                        description=description,
+                        "Mean score",
+                        description="The mean of the scores from all columns.",
                         lower_is_better=table.header[0].lower_is_better,
                     )
                 )

@@ -1272,6 +1278,12 @@ def main():
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
+    )
     args = parser.parse_args()
 
     release: Optional[str] = None

@@ -1301,7 +1313,8 @@ def main():
     register_configs_from_directory(args.local_path)
 
     # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
         release=release,
         suites=suites,
         suite=suite,
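
Taken together, the summarize.py hunks above let helm-summarize construct an alternative Summarizer subclass named on the command line. A hedged sketch of the dispatch; the fully qualified class path below is inferred from the new file introduced next and is not spelled out in the diff itself:

from helm.common.object_spec import get_class_by_name
from helm.benchmark.presentation.summarize import Summarizer

# Equivalent of passing --summarizer-class-name on the helm-summarize command line.
summarizer_class_name = "helm.benchmark.presentation.torr_robustness_summarizer.ToRRRobustnessSummarizer"
summarizer_cls = get_class_by_name(summarizer_class_name) if summarizer_class_name else Summarizer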
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.metrics.metric import PerInstanceStats
|
|
5
|
+
from helm.benchmark.presentation.schema import MetricNameMatcher, RunGroup
|
|
6
|
+
from helm.benchmark.presentation.summarize import Run, Summarizer
|
|
7
|
+
from helm.benchmark.presentation.table import Cell
|
|
8
|
+
from helm.common.codec import from_json
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ToRRRobustnessSummarizer(Summarizer):
|
|
12
|
+
"""A Summarizer that computes robustness metrics.
|
|
13
|
+
|
|
14
|
+
This Summarizer computes a robustness metrics based on the definition in the ToRR paper.
|
|
15
|
+
The instance-level robustness score for a given model and instance is defined as
|
|
16
|
+
1 - (max_score - min_score) where max_score and min_scores are the maximum and minimum
|
|
17
|
+
scores for the model on that instance across all runs (i.e. across all augmentations
|
|
18
|
+
and serialization formats). The robustness score for a given model and scenario is
|
|
19
|
+
the mean of the model's instance-level robustness score across all instances in that scenario.
|
|
20
|
+
|
|
21
|
+
The core HELM framework does not natively support computing metrics that depend on
|
|
22
|
+
per-instance metrics across multiple runs, therefore this special summarizer is needed
|
|
23
|
+
to compute this robustness metic."""
|
|
24
|
+
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
release: Optional[str],
|
|
28
|
+
suites: Optional[List[str]],
|
|
29
|
+
suite: Optional[str],
|
|
30
|
+
schema_path: str,
|
|
31
|
+
output_path: str,
|
|
32
|
+
verbose: bool,
|
|
33
|
+
num_threads: int,
|
|
34
|
+
allow_unknown_models: bool,
|
|
35
|
+
):
|
|
36
|
+
super().__init__(
|
|
37
|
+
release,
|
|
38
|
+
suites,
|
|
39
|
+
suite,
|
|
40
|
+
schema_path,
|
|
41
|
+
output_path,
|
|
42
|
+
verbose,
|
|
43
|
+
num_threads,
|
|
44
|
+
allow_unknown_models,
|
|
45
|
+
)
|
|
46
|
+
self.run_group_to_model_name_to_robustness: Dict[str, Dict[str, float]] = {}
|
|
47
|
+
|
|
48
|
+
PERFORMANCE_METRIC_GROUP_NAME = "performance_metrics"
|
|
49
|
+
ROBUSTNESS_METRIC_GROUP_NAME = "robustness_metrics"
|
|
50
|
+
ROBUSTNESS_METRIC_NAME = "robustness"
|
|
51
|
+
|
|
52
|
+
def _get_instance_id_to_performance(
|
|
53
|
+
self, run: Run, performance_metric_matcher: MetricNameMatcher
|
|
54
|
+
) -> Dict[str, float]:
|
|
55
|
+
with open(os.path.join(run.run_path, "per_instance_stats.json")) as f:
|
|
56
|
+
per_instance_stats = from_json(f.read(), List[PerInstanceStats])
|
|
57
|
+
instance_id_to_performance: Dict[str, float] = {}
|
|
58
|
+
for per_instance_stats_item in per_instance_stats:
|
|
59
|
+
assert per_instance_stats_item.train_trial_index == 0
|
|
60
|
+
assert per_instance_stats_item.perturbation is None
|
|
61
|
+
for stat in per_instance_stats_item.stats:
|
|
62
|
+
if performance_metric_matcher.matches(stat.name):
|
|
63
|
+
assert per_instance_stats_item.instance_id not in instance_id_to_performance
|
|
64
|
+
if stat.mean is not None:
|
|
65
|
+
instance_id_to_performance[per_instance_stats_item.instance_id] = stat.mean
|
|
66
|
+
|
|
67
|
+
return instance_id_to_performance
|
|
68
|
+
|
|
69
|
+
def _compute_robustness_for_runs(self, runs: List[Run], performance_metric_matcher: MetricNameMatcher) -> float:
|
|
70
|
+
instance_id_to_performances: Dict[str, List[float]] = {}
|
|
71
|
+
for run in runs:
|
|
72
|
+
for instance_id, performance in self._get_instance_id_to_performance(
|
|
73
|
+
run, performance_metric_matcher
|
|
74
|
+
).items():
|
|
75
|
+
if instance_id not in instance_id_to_performances:
|
|
76
|
+
instance_id_to_performances[instance_id] = []
|
|
77
|
+
instance_id_to_performances[instance_id].append(performance)
|
|
78
|
+
instance_id_to_robustness: Dict[str, float] = {}
|
|
79
|
+
for instance_id, performances in instance_id_to_performances.items():
|
|
80
|
+
instance_id_to_robustness[instance_id] = 1 - (max(performances) - min(performances))
|
|
81
|
+
return sum(instance_id_to_robustness.values()) / len(instance_id_to_robustness.values())
|
|
82
|
+
|
|
83
|
+
    def _compute_robustness_for_run_group(self, run_group: RunGroup) -> Dict[str, float]:
        performance_metric_group = self.schema.name_to_metric_group[self.PERFORMANCE_METRIC_GROUP_NAME]
        assert len(performance_metric_group.metrics) == 1
        performance_metric_matcher = performance_metric_group.metrics[0].substitute(run_group.environment)

        group_runs = [run for run in self.runs if run_group.name in run.run_spec.groups]
        model_name_to_runs: Dict[str, List[Run]] = {}

        for run in group_runs:
            model_name = run.run_spec.adapter_spec.model
            if model_name not in model_name_to_runs:
                model_name_to_runs[model_name] = []
            model_name_to_runs[model_name].append(run)

        model_to_robustness: Dict[str, float] = {}
        for model_name, model_runs in model_name_to_runs.items():
            model_to_robustness[model_name] = self._compute_robustness_for_runs(model_runs, performance_metric_matcher)
        return model_to_robustness

    def write_groups(self):
        for run_group in self.schema.run_groups:
            if (
                self.ROBUSTNESS_METRIC_GROUP_NAME in run_group.metric_groups
                and self.PERFORMANCE_METRIC_GROUP_NAME in run_group.metric_groups
            ):
                self.run_group_to_model_name_to_robustness[run_group.name] = self._compute_robustness_for_run_group(
                    run_group
                )
        return super().write_groups()

    def create_cell(
        self,
        runs: List[Run],
        matcher: MetricNameMatcher,
        additional_info: Optional[str],
        hide_value: bool = False,
        is_scenario_table: bool = False,
    ) -> Cell:
        """
        Use the metric name identified by `matcher` to pull out the stats from
        `runs` and return a representation of the average.

        There are four cases:
        1. No matching runs
        2. Matching runs but no matching stats (maybe the stat was named incorrectly)
        3. Matching runs, matching stats, but the stats have count = 0, so the mean is undefined
           (e.g., the bias metric ran and computed 0/0)
        4. Matching runs, matching stats, stats with count > 0

        In the first three cases, the cell value is None, but the description distinguishes between these cases.
        """
        if matcher.name != self.ROBUSTNESS_METRIC_NAME:
            return super().create_cell(runs, matcher, additional_info, hide_value, is_scenario_table)

        if len(runs) == 0:
            return Cell(value=None, description="No matching runs")

        # Link the runs that this cell was aggregated from, if this is not a scenario table.
        # Scenario tables link to the runs in the model cells,
        # whereas non-scenario tables link to the runs in the metrics cells.
        run_spec_names: Optional[List] = None
        if not is_scenario_table:
            # Deduplicate run spec names because aggregated_run_specs may have duplicated
            # run specs if a run spec belongs to multiple groups.
            run_spec_names = []
            run_spec_names_set = set()
            for run in runs:
                if run.run_spec.name not in run_spec_names_set:
                    run_spec_names.append(run.run_spec.name)
                    run_spec_names_set.add(run.run_spec.name)

        run_group_set = set(runs[0].run_spec.groups) & set(self.run_group_to_model_name_to_robustness.keys())
        assert len(run_group_set) == 1
        run_group = next(iter(run_group_set))

        model_names_set = set(run.run_spec.adapter_spec.model for run in runs)
        assert len(model_names_set) == 1
        model_name = next(iter(model_names_set))

        value = (
            self.run_group_to_model_name_to_robustness[run_group][model_name]
            if self.run_group_to_model_name_to_robustness[run_group]
            and self.run_group_to_model_name_to_robustness[run_group][model_name]
            else None
        )
        description = str(round(value, 3)) if value is not None else ""
        if hide_value:
            value = None
            description = ""
        if additional_info:
            description += "\n" + additional_info
        if self.verbose:
            description += "\n-- ".join(["\nRun specs:", *(run_spec_names or [])])

        return Cell(
            value=value,
            description=description,
            style={},
            run_spec_names=run_spec_names,
        )
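For orientation, here is a minimal sketch of the nested mapping these methods populate and read: write_groups() fills run_group_to_model_name_to_robustness once per run group, and create_cell() resolves exactly one run group and one model for a cell before looking up its value. The group name, model names, and scores below are hypothetical and do not come from this diff.

    from typing import Dict

    # Hypothetical contents after write_groups() (illustrative names and values only):
    run_group_to_model_name_to_robustness: Dict[str, Dict[str, float]] = {
        "civil_comments": {
            "openai/gpt2": 0.81,
            "eleutherai/gpt-j-6b": 0.67,
        },
    }

    # create_cell() then reads the single (run group, model) entry behind the cell:
    value = run_group_to_model_name_to_robustness["civil_comments"]["openai/gpt2"]  # -> 0.81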
@@ -0,0 +1,203 @@
import argparse
from dataclasses import replace
import re
from typing import List

from helm.benchmark import model_metadata_registry
from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
from helm.common.general import ensure_directory_exists
from helm.common.hierarchical_logger import hlog, htrack
from helm.common.authentication import Authentication
from helm.proxy.services.remote_service import create_authentication, add_service_args

from helm.benchmark.config_registry import (
    register_configs_from_directory,
    register_builtin_configs_from_helm_package,
)
from helm.benchmark.runner import set_benchmark_output_path
from helm.common.reeval_parameters import REEvalParameters
from helm.benchmark.run import (
    run_benchmarking,
    validate_args,
    add_run_args,
    run_entries_to_run_specs,
)


@htrack(None)
def main():
    parser = argparse.ArgumentParser()
    add_service_args(parser)
    parser.add_argument(
        "-c",
        "--conf-paths",
        nargs="+",
        help="Where to read RunSpecs to run from",
        default=[],
    )
    parser.add_argument(
        "--models-to-run",
        nargs="+",
        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
        default=None,
    )
    parser.add_argument(
        "--groups-to-run",
        nargs="+",
        help="Only RunSpecs with these (scenario) groups specified. If no group is specified, runs with all groups.",
        default=None,
    )
    parser.add_argument(
        "--exit-on-error",
        action="store_true",
        help="Fail and exit immediately if a particular RunSpec fails.",
    )
    parser.add_argument(
        "--skip-completed-runs",
        action="store_true",
        help="Skip RunSpecs that have completed, i.e. their output files exist.",
    )
    parser.add_argument(
        "--priority",
        type=int,
        default=None,
        help="Run RunSpecs with priority less than or equal to this number. "
        "If a value for --priority is not specified, run on everything",
    )
    parser.add_argument(
        "--run-specs",
        nargs="*",
        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
        "Specifies run entries to run.",
        default=[],
    )
    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
    parser.add_argument(
        "--enable-huggingface-models",
        nargs="+",
        default=[],
        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
        "Format: namespace/model_name[@revision]",
    )
    parser.add_argument(
        "--enable-local-huggingface-models",
        nargs="+",
        default=[],
        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
    )
    # reeval parameters
    parser.add_argument(
        "--model-ability",
        type=float,
        default=0.0,
        help="The initial ability of the model for reeval evaluation.",
    )
    add_run_args(parser)
    args = parser.parse_args()
    validate_args(args)

    register_builtin_configs_from_helm_package()
    register_configs_from_directory(args.local_path)

    if args.enable_huggingface_models:
        from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

        for huggingface_model_name in args.enable_huggingface_models:
            register_huggingface_hub_model_from_flag_value(huggingface_model_name)

    if args.enable_local_huggingface_models:
        from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

        for huggingface_model_path in args.enable_local_huggingface_models:
            register_huggingface_local_model_from_flag_value(huggingface_model_path)

    run_entries: List[RunEntry] = []
    if args.conf_paths:
        run_entries.extend(read_run_entries(args.conf_paths).entries)
    if args.run_entries:
        run_entries.extend(
            [RunEntry(description=description, priority=1, groups=None) for description in args.run_entries]
        )
    # TODO: Remove this eventually.
    if args.run_specs:
        run_entries.extend(
            [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
        )

    # Must set benchmark output path before getting RunSpecs,
    # because run spec functions can use the benchmark output directory for caching.
    ensure_directory_exists(args.output_path)
    set_benchmark_output_path(args.output_path)

    # Validate the --models-to-run flag
    if args.models_to_run:
        all_models = set(model_metadata_registry.get_all_models())
        for model_to_run in args.models_to_run:
            if model_to_run not in all_models:
                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
    else:
        model_expander_pattern = re.compile(
            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
        )
        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
            raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")

    run_specs = run_entries_to_run_specs(
        run_entries=run_entries,
        max_eval_instances=args.max_eval_instances,
        num_train_trials=args.num_train_trials,
        models_to_run=args.models_to_run,
        groups_to_run=args.groups_to_run,
        priority=args.priority,
    )
    hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")

    if len(run_specs) == 0:
        hlog("There were no RunSpecs or they got filtered out.")
        return

    # Add reeval_parameters
    run_specs = [
        replace(
            run_spec,
            adapter_spec=replace(
                run_spec.adapter_spec, reeval_parameters=REEvalParameters(model_ability=args.model_ability)
            ),
        )
        for run_spec in run_specs
    ]

    auth: Authentication = (
        Authentication("") if args.skip_instances or not args.server_url else create_authentication(args)
    )

    run_benchmarking(
        run_specs=run_specs,
        auth=auth,
        url=args.server_url,
        local_path=args.local_path,
        num_threads=args.num_threads,
        output_path=args.output_path,
        suite=args.suite,
        dry_run=args.dry_run,
        skip_instances=args.skip_instances,
        cache_instances=args.cache_instances,
        cache_instances_only=args.cache_instances_only,
        skip_completed_runs=args.skip_completed_runs,
        exit_on_error=args.exit_on_error,
        runner_class_name="helm.benchmark.reeval_runner.REEvalRunner",
        mongo_uri=args.mongo_uri,
        disable_cache=args.disable_cache,
    )

    if args.run_specs:
        hlog(
            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
            "Use --run-entries instead."
        )

    hlog("Done.")


if __name__ == "__main__":
    main()
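As a side note on the --models-to-run validation above: when --models-to-run is not given, the script only rejects run entries whose description uses a `model=` run expander that fans out to many models. A minimal, self-contained sketch of that check is shown below; the pattern string is copied from the file above, while the two example run entry descriptions are made up for illustration.

    import re

    # Same pattern as in the new entry point: detect un-pinned `model=` run expanders.
    model_expander_pattern = re.compile(
        r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
    )

    # Hypothetical run entry descriptions (not taken from this diff):
    assert model_expander_pattern.search("mmlu:subject=anatomy,model=text") is not None  # fans out to many models
    assert model_expander_pattern.search("mmlu:subject=anatomy,model=openai/gpt2") is None  # pinned to one model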