PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (499) hide show

helm/benchmark/presentation/summarize.py CHANGED Viewed

@@ -22,13 +22,11 @@ from typing import List, Optional, Dict, Any, Tuple, Set
 from tqdm import tqdm
 from helm.benchmark.model_deployment_registry import get_model_deployment
 from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
 from helm.common.general import (
     write,
     ensure_directory_exists,
     asdict_without_nones,
-    serialize_dates,
     parallel_map,
     singleton,
     unique_simplification,
@@ -42,27 +40,22 @@ from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric import get_all_stats_by_name
 from helm.benchmark.metrics.statistic import Stat, merge_stat
-from helm.benchmark.runner import RunSpec, LATEST_SYMLINK
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
     MetricNameMatcher,
     RunGroup,
+    Field,
     read_schema,
-    SCHEMA_CLASSIC_YAML_FILENAME,
+    get_default_schema_path,
     BY_GROUP,
     THIS_GROUP_ONLY,
     NO_GROUPS,
 )
-from helm.benchmark.presentation.contamination import (
-    read_contamination,
-    validate_contamination,
-    CONTAMINATION_SYMBOLS,
-    CONTAMINATION_STYLES,
-    CONTAMINATION_LEVEL_STRONG,
-)
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
-from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata
+from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
 OVERLAP_N_COUNT = 13
@@ -172,7 +165,7 @@ def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetad
     except ValueError:
         pass
-    # Return a placeholder "unknoown model" model metadata.
+    # Return a placeholder "unknown model" model metadata.
     return get_unknown_model_metadata(adapter_spec.model)
@@ -238,15 +231,7 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
         if lower_is_better is None:  # column does not have a meaningful ordering
             continue
-        # sort row indices by cell value and then compute the number of wins as the index in the sorted list
-        def is_cell_valid(cell: Cell) -> bool:  # ignore cells which are strongly contaminated or have no value
-            if cell.value is None:
-                return False
-            if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
-                return False
-            return True
-        values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
+        values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
         if len(values) < 2:  # don't rank a single model
             continue
         for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
@@ -309,7 +294,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_file: str,
+        schema_path: str,
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -329,7 +314,7 @@ class Summarizer:
         self.suites: List[str]
         self.run_suite_paths: List[str]
         self.suite: Optional[str] = None
-        self.schema_file = schema_file
+        self.schema_path = schema_path
         self.release: Optional[str] = None
         if suite:
             self.suite = suite
@@ -347,9 +332,7 @@ class Summarizer:
         ensure_directory_exists(self.run_release_path)
-        self.schema = read_schema(schema_file)
-        self.contamination = read_contamination()
-        validate_contamination(self.contamination, self.schema)
+        self.schema = read_schema(schema_path)
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -377,7 +360,7 @@ class Summarizer:
                 if run_group_name not in self.schema.name_to_run_group:
                     hlog(
                         f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
-                        f"but undefined in {self.schema_file}, skipping"
+                        f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
                 run_group = self.schema.name_to_run_group[run_group_name]
@@ -433,11 +416,61 @@ class Summarizer:
                 self.group_adapter_to_runs[group_name][adapter_spec].append(run)
                 self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
-    def write_schema(self):
+    @dataclass(frozen=True)
+    class _ModelField(Field):
+        """The frontend version of ModelMetadata.
+        The frontend expects schema.json to contain a field under "models" that contains a list of `ModelField`s.
+        All attributes have the same meaning as in ModelMetadata."""
+        # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
+        creator_organization: Optional[str] = None
+        access: Optional[str] = None
+        todo: bool = False
+        release_date: Optional[str] = None
+        num_parameters: Optional[int] = None
+    def get_model_field_dicts(self) -> List[Dict]:
+        """Get a list of `ModelField`s dicts that will be written to schema.json.
+        The frontend expects schema.json to contain a field under "models" that contains a list of `ModelField`s.
+        This is populated by reading the `ModelMetadata` configs and filtering down to models that were
+        actually used, and converting each `ModelMetadata` to a `ModelField`."""
+        # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
+        used_model_names: Set[str] = set()
+        for run in self.runs:
+            used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
+        model_field_dicts: List[Dict] = []
+        for model_name in get_all_models():
+            if model_name not in used_model_names:
+                continue
+            model_metadata = get_model_metadata(model_name)
+            model_field = Summarizer._ModelField(
+                name=model_metadata.name,
+                display_name=model_metadata.display_name,
+                short_display_name=model_metadata.display_name,
+                description=model_metadata.description,
+                creator_organization=model_metadata.creator_organization_name,
+                access=model_metadata.access,
+                todo=False,
+                release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
+                num_parameters=model_metadata.num_parameters,
+            )
+            model_field_dicts.append(asdict_without_nones(model_field))
+        return model_field_dicts
+    def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
+        # Manually add the model metadata to the schema.json, where the frontend expects it.
+        # TODO: Move model metadata out of schema.json into its own model_metadata.json file.
+        raw_schema = asdict_without_nones(self.schema)
+        raw_schema["models"] = self.get_model_field_dicts()
         write(
             os.path.join(self.run_release_path, "schema.json"),
-            json.dumps(asdict_without_nones(self.schema), indent=2, default=serialize_dates),
+            json.dumps(raw_schema, indent=2),
         )
     def read_runs(self):
@@ -512,6 +545,7 @@ class Summarizer:
             return file_metadata
+        # TODO: Delete this after @andyzorigin's project is done.
         self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
         data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
@@ -594,7 +628,7 @@ class Summarizer:
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
                 hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_file} "
+                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
@@ -726,9 +760,6 @@ class Summarizer:
                             num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
                             num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
-                if len(num_instances) == 0:
-                    continue
                 rows.append(
                     [
                         Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
@@ -762,9 +793,9 @@ class Summarizer:
         self,
         runs: List[Run],
         matcher: MetricNameMatcher,
-        contamination_level: Optional[str],
         additional_info: Optional[str],
         hide_value: bool = False,
+        is_scenario_table: bool = False,
     ) -> Cell:
         """
         Use the metric name identified by `matcher` to pull out the stats from
@@ -818,18 +849,33 @@ class Summarizer:
         if self.verbose:
             description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
-        style: Dict[str, Any] = {}
-        if contamination_level is not None:
-            style = CONTAMINATION_STYLES.get(contamination_level, style)
-        return Cell(value=value, description=description, style=style, contamination_level=contamination_level)
+        # Link the runs that this cell was aggregated from, if this is not a scenario table.
+        # Scenario tables link to the runs in the model cells,
+        # whereas non-scenario tables link to the runs in the metrics cells.
+        run_spec_names: Optional[List] = None
+        if not is_scenario_table:
+            # Deduplicate run spec names because aggregated_run_specs may have duplicated
+            # run specs if a run spec belongs to multiple groups.
+            run_spec_names = []
+            run_spec_names_set = set()
+            for run_spec_name in aggregated_run_specs:
+                if run_spec_name not in run_spec_names_set:
+                    run_spec_names.append(run_spec_name)
+                    run_spec_names_set.add(run_spec_name)
+        return Cell(
+            value=value,
+            description=description,
+            style={},
+            run_spec_names=run_spec_names,
+        )
     def create_group_table(
         self,
         name: str,
         title: str,
         adapter_to_runs: Dict[AdapterSpec, List[Run]],
-        link_to_runs: bool,
+        is_scenario_table: bool,
         columns: List[Tuple[RunGroup, str]],  # run_group, metric_group
         sort_by_model_order: bool = True,
         sub_split: Optional[str] = None,
@@ -868,7 +914,7 @@ class Summarizer:
                     matcher = replace(matcher, sub_split=sub_split)
                 header_field = self.schema.name_to_metric.get(matcher.name)
                 if header_field is None:
-                    hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_file}, skipping")
+                    hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
                     continue
                 metadata = {
                     "metric": header_field.get_short_display_name(),
@@ -921,10 +967,10 @@ class Summarizer:
         adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
         if sort_by_model_order:
-            # Sort models by the order defined in the schema.
-            # Models not defined in the schema will be sorted alphabetically and
-            # placed before models in defined the schema.
-            model_order = [model.name for model in self.schema.models]
+            # Sort models by the order defined in the model metadata config.
+            # Models not defined in the model metadata config will be sorted alphabetically and
+            # placed before models defined in the model metadata config.
+            model_order = get_all_models()
             def _adapter_spec_sort_key(spec):
                 index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
@@ -948,21 +994,18 @@ class Summarizer:
             runs = adapter_to_runs[adapter_spec]
             display_name = get_method_display_name(model_metadata.display_name, info)
-            # Link to all the runs under this model
-            if link_to_runs:
+            # Link the runs that this row was aggregated from, if this is a scenario table.
+            # Scenario tables link to the runs in the model cells,
+            # whereas non-scenario tables link to the runs in the metrics cells.
+            run_spec_names: Optional[List[str]]
+            if is_scenario_table:
                 run_spec_names = [run.run_spec.name for run in runs]
                 href = run_spec_names_to_url(run_spec_names)
             else:
+                run_spec_names = None
                 href = None
-            # Render contamination information
-            point = self.contamination.get_point(model_name, columns[0][0].name)
-            if num_groups == 1 and point is not None:  # display contamination information at the adapter level
-                cells = [
-                    Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
-                ]
-            else:
-                cells = [Cell(display_name, description="", href=href)]
+            cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
             assert len(group_names) == len(matchers)
             for group_name, matcher in zip(group_names, matchers):
                 group_runs = [run for run in runs if group_name in run.run_spec.groups]
@@ -971,13 +1014,7 @@ class Summarizer:
                 if "babi" in group_name and "task:" not in name:
                     group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]
-                point = self.contamination.get_point(model_name, group_name)
-                if point is not None:
-                    description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
-                    contamination_level = point.level
-                else:
-                    description = ""
-                    contamination_level = None
+                description = ""
                 group_overlap_stats = None
                 if (model_name, group_name) in self._model_group_overlap_stats:
@@ -999,9 +1036,9 @@ class Summarizer:
                     self.create_cell(
                         group_runs,
                         matcher,
-                        contamination_level,
                         additional_info=description,
                         hide_value=hide_value,
+                        is_scenario_table=is_scenario_table,
                     )
                 )
@@ -1011,7 +1048,7 @@ class Summarizer:
         # There could be a ton of runs, so only do this if there are 2-5
         # TODO: replace in frontend with a selector to choose which rows to visualize.
         links = []
-        if link_to_runs:
+        if is_scenario_table:
             all_run_spec_names = []
             for adapter_spec, runs in adapter_to_runs.items():
                 if len(runs) > 1:
@@ -1094,8 +1131,8 @@ class Summarizer:
                     title=display_name,
                     adapter_to_runs=adapter_to_runs,
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
-                    link_to_runs=False,
-                    add_win_rate=True,
+                    is_scenario_table=False,
+                    add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
                 )
                 tables.append(table)
         return tables
@@ -1126,7 +1163,7 @@ class Summarizer:
                         name=scenario_name,
                         adapter_to_runs=adapter_to_runs,
                         columns=columns,
-                        link_to_runs=True,
+                        is_scenario_table=True,
                     )
                     tables.append(table)
                     scenarios_shown += 1
@@ -1138,7 +1175,7 @@ class Summarizer:
                                 name=f"{subgroup.name}:sub_split={sub_split}",
                                 adapter_to_runs=adapter_to_runs,
                                 columns=columns,
-                                link_to_runs=False,
+                                is_scenario_table=False,
                                 sub_split=sub_split,
                             )
                             tables.append(table)
@@ -1158,7 +1195,7 @@ class Summarizer:
                         name=subgroup.name,
                         adapter_to_runs=adapter_to_runs,
                         columns=columns,
-                        link_to_runs=False,
+                        is_scenario_table=False,
                     )
                     tables = [table] + tables
             all_tables.extend(tables)
@@ -1260,9 +1297,9 @@ class Summarizer:
             for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
                 scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
                 scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-                self.scenario_spec_instance_id_dict[
-                    scenario_spec_instance_ids.scenario_spec
-                ] = scenario_spec_instance_ids.instance_ids
+                self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
+                    scenario_spec_instance_ids.instance_ids
+                )
     def write_scenario_spec_instance_ids_json(self, file_path) -> None:
         for run in self.runs:
@@ -1304,8 +1341,6 @@ class Summarizer:
     def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
         """Run the entire summarization pipeline."""
-        self.write_schema()
         self.read_runs()
         self.group_runs()
         self.check_metrics_defined()
@@ -1320,6 +1355,10 @@ class Summarizer:
         # because it uses self.scenario_spec_instance_id_dict
         self.read_overlap_stats()
+        # Must happen after self.read_runs()
+        # because it uses self.runs
+        self.write_schema()
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1337,10 +1376,9 @@ def main():
         "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
     )
     parser.add_argument(
-        "--schema-file",
+        "--schema-path",
         type=str,
-        help="File name of the schema to read (e.g., schema_classic.yaml).",
-        default=SCHEMA_CLASSIC_YAML_FILENAME,
+        help="Path to the schema file (e.g., schema_classic.yaml).",
     )
     parser.add_argument(
         "--suite",
@@ -1407,6 +1445,8 @@ def main():
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1415,7 +1455,7 @@ def main():
         release=release,
         suites=suites,
         suite=suite,
-        schema_file=args.schema_file,
+        schema_path=schema_path,
         output_path=args.output_path,
         verbose=args.debug,
         num_threads=args.num_threads,

helm/benchmark/presentation/table.py CHANGED Viewed

@@ -4,26 +4,26 @@ from typing import Any, Optional, List, Dict
 @dataclass(frozen=True)
 class Cell:
-    # Semantic value (that can be used for sorting)
     value: Any
+    """Semantic value (that can be used for sorting)"""
-    # Optionally, if we want to render things specially (floating points to 3 decimal points)
     display_value: Optional[str] = None
+    """Optionally, if we want to render things specially (floating points to 3 decimal points)"""
-    # Detailed description if hover over the cell
     description: Optional[str] = None
+    """Detailed description if hover over the cell"""
-    # If we click on the link for this cell, it takes us somewhere
     href: Optional[str] = None
+    """If we click on the link for this cell, it takes us somewhere"""
-    # Styling
     style: Optional[Dict[str, Any]] = None
+    """Styling"""
-    # If the value or display_value is markdown that needs to be interpreted
     markdown: bool = False
+    """If the value or display_value is markdown that needs to be interpreted"""
-    # How much train-test contamination affects the cell's value (`contamination.CONTAMINATION_LEVEL_{WEAK/STRONG}`)
-    contamination_level: Optional[str] = None
+    run_spec_names: Optional[List[str]] = None
+    """The names of the runs that this cell's value was aggregated from, if the cell contains an aggregate value."""
 @dataclass(frozen=True)

helm/benchmark/presentation/test_contamination.py CHANGED Viewed

@@ -1,9 +1,9 @@
-from helm.benchmark.presentation.schema import read_schema, SCHEMA_CLASSIC_YAML_FILENAME
+from helm.benchmark.presentation.schema import read_schema, get_default_schema_path
 from helm.benchmark.presentation.contamination import read_contamination, validate_contamination
 def test_contamination_schema():
-    schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
+    schema = read_schema(get_default_schema_path())
     contamination = read_contamination()
     validate_contamination(contamination, schema)

helm/benchmark/presentation/test_run_entry.py CHANGED Viewed

@@ -3,8 +3,7 @@ import pytest
 from helm.common.object_spec import parse_object_spec
 from helm.benchmark.presentation.run_entry import read_run_entries
-from helm.benchmark.run_specs import construct_run_specs
-from helm.benchmark import vlm_run_specs  # noqa
+from helm.benchmark.run_spec_factory import construct_run_specs
 def list_fnames():

helm/benchmark/presentation/test_summarize.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import tempfile
 from helm.benchmark.presentation.summarize import Summarizer
-from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
+from helm.benchmark.presentation.schema import get_default_schema_path
 from helm.common.general import ensure_directory_exists
@@ -13,7 +13,7 @@ def test_summarize_suite():
             release=None,
             suites=None,
             suite="test_suite",
-            schema_file=SCHEMA_CLASSIC_YAML_FILENAME,
+            schema_path=get_default_schema_path(),
             output_path=output_path,
             verbose=False,
             num_threads=4,
@@ -31,7 +31,7 @@ def test_summarize_release():
             release="test_release",
             suites=["test_suite_1", "test_suite_2"],
             suite=None,
-            schema_file=SCHEMA_CLASSIC_YAML_FILENAME,
+            schema_path=get_default_schema_path(),
             output_path=output_path,
             verbose=False,
             num_threads=4,

helm/benchmark/run.py CHANGED Viewed

@@ -1,27 +1,26 @@
 import argparse
 from dataclasses import replace
+import os
 from typing import List, Optional
-from helm.benchmark.huggingface_registration import (
-    register_huggingface_hub_model_from_flag_value,
-    register_huggingface_local_model_from_flag_value,
-)
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
+from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
+from helm.proxy.services.service import CACHE_DIR
 from helm.benchmark.config_registry import (
     register_configs_from_directory,
     register_builtin_configs_from_helm_package,
 )
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark import vlm_run_specs  # noqa
-from .executor import ExecutionSpec
-from .runner import Runner, RunSpec, LATEST_SYMLINK, set_benchmark_output_path
-from .run_specs import construct_run_specs
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.runner import Runner, RunSpec, LATEST_SYMLINK, set_benchmark_output_path
+from helm.benchmark.run_spec_factory import construct_run_specs
 def run_entries_to_run_specs(
@@ -85,16 +84,29 @@ def run_benchmarking(
     skip_completed_runs: bool,
     exit_on_error: bool,
     runner_class_name: Optional[str],
-    mongo_uri: str = "",
+    mongo_uri: Optional[str] = None,
+    disable_cache: Optional[bool] = None,
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
+    sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None
+    mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
+    if not disable_cache:
+        if mongo_uri:
+            mongo_cache_backend_config = MongoCacheBackendConfig(mongo_uri)
+        else:
+            sqlite_cache_path = os.path.join(local_path, CACHE_DIR)
+            ensure_directory_exists(sqlite_cache_path)
+            sqlite_cache_backend_config = SqliteCacheBackendConfig(sqlite_cache_path)
     execution_spec = ExecutionSpec(
         auth=auth,
         url=url,
         local_path=local_path,
         parallelism=num_threads,
         dry_run=dry_run,
-        mongo_uri=mongo_uri,
+        sqlite_cache_backend_config=sqlite_cache_backend_config,
+        mongo_cache_backend_config=mongo_cache_backend_config,
     )
     with htrack_block("run_specs"):
         for run_spec in run_specs:
@@ -160,13 +172,6 @@ def add_run_args(parser: argparse.ArgumentParser):
         help="Name of the suite this run belongs to (default is today's date).",
         required=True,
     )
-    parser.add_argument(
-        "--local",
-        action="store_true",
-        help="DEPRECATED: Does nothing. Do not use. Previously enabled local mode. "
-        "Now does nothing and will be removed in the next released version. "
-        "Local mode is enabled by default, and only disabled if the --server_url flag is set.",
-    )
     parser.add_argument(
         "--local-path",
         type=str,
@@ -179,6 +184,11 @@ def add_run_args(parser: argparse.ArgumentParser):
         help="If non-empty, the URL of the MongoDB database that will be used for caching instead of SQLite",
         default="",
     )
+    parser.add_argument(
+        "--disable-cache",
+        action="store_true",
+        help="If true, the request-response cache for model clients and tokenizers will be disabled.",
+    )
 def validate_args(args):
@@ -227,7 +237,14 @@ def main():
         help="Run RunSpecs with priority less than or equal to this number. "
         "If a value for --priority is not specified, run on everything",
     )
-    parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
     parser.add_argument(
         "--enable-huggingface-models",
         nargs="+",
@@ -254,14 +271,25 @@ def main():
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
-    for huggingface_model_name in args.enable_huggingface_models:
-        register_huggingface_hub_model_from_flag_value(huggingface_model_name)
-    for huggingface_model_path in args.enable_local_huggingface_models:
-        register_huggingface_local_model_from_flag_value(huggingface_model_path)
+    if args.enable_huggingface_models:
+        from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
+        for huggingface_model_name in args.enable_huggingface_models:
+            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+    if args.enable_local_huggingface_models:
+        from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
+        for huggingface_model_path in args.enable_local_huggingface_models:
+            register_huggingface_local_model_from_flag_value(huggingface_model_path)
     run_entries: List[RunEntry] = []
     if args.conf_paths:
         run_entries.extend(read_run_entries(args.conf_paths).entries)
+    if args.run_entries:
+        run_entries.extend(
+            [RunEntry(description=description, priority=1, groups=None) for description in args.run_entries]
+        )
+    # TODO: Remove this eventually.
     if args.run_specs:
         run_entries.extend(
             [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
@@ -306,13 +334,13 @@ def main():
         exit_on_error=args.exit_on_error,
         runner_class_name=args.runner_class_name,
         mongo_uri=args.mongo_uri,
+        disable_cache=args.disable_cache,
     )
-    if args.local:
+    if args.run_specs:
         hlog(
-            "WARNING: The --local flag is deprecated. It now does nothing and will be removed in "
-            "the next released version. Local mode is enabled by default, and only disabled if the "
-            "--server_url flag is set. Please remove --local from your command."
+            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
+            "Use --run-entries instead."
         )
     hlog("Done.")

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl