crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
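Two of the largest mechanical changes in this list are the moves of `helm/proxy/clients/*` to `helm/clients/*` and of `helm/proxy/tokenizers/*` to `helm/tokenizers/*`. A minimal sketch of what that means for downstream imports, using module paths taken from the list above (the class names are assumptions for illustration, not confirmed by this diff):

```python
# Import-path migration implied by the renames above (0.3.0 -> 0.5.0).
# Only the module paths come from the file list; the class names are
# the usual ones in HELM and are an assumption here.

# Before (0.3.0):
#   from helm.proxy.clients.huggingface_client import HuggingFaceClient
#   from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# After (0.5.0):
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
```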
|
@@ -21,7 +21,8 @@ from statistics import mean, median
|
|
|
21
21
|
from typing import List, Optional, Dict, Any, Tuple, Set
|
|
22
22
|
|
|
23
23
|
from tqdm import tqdm
|
|
24
|
-
|
|
24
|
+
from helm.benchmark.model_deployment_registry import get_model_deployment
|
|
25
|
+
from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
|
|
25
26
|
from helm.common.general import (
|
|
26
27
|
write,
|
|
27
28
|
ensure_directory_exists,
|
|
@@ -39,25 +40,22 @@ from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
|
|
|
39
40
|
from helm.benchmark.metrics.metric_name import MetricName
|
|
40
41
|
from helm.benchmark.metrics.metric import get_all_stats_by_name
|
|
41
42
|
from helm.benchmark.metrics.statistic import Stat, merge_stat
|
|
42
|
-
from helm.benchmark.
|
|
43
|
+
from helm.benchmark.run_spec import RunSpec
|
|
44
|
+
from helm.benchmark.runner import LATEST_SYMLINK
|
|
43
45
|
from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
|
|
44
46
|
from helm.benchmark.presentation.schema import (
|
|
45
47
|
MetricNameMatcher,
|
|
46
48
|
RunGroup,
|
|
49
|
+
Field,
|
|
47
50
|
read_schema,
|
|
48
|
-
|
|
51
|
+
get_default_schema_path,
|
|
49
52
|
BY_GROUP,
|
|
50
53
|
THIS_GROUP_ONLY,
|
|
51
54
|
NO_GROUPS,
|
|
52
55
|
)
|
|
53
|
-
from helm.benchmark.
|
|
54
|
-
read_contamination,
|
|
55
|
-
validate_contamination,
|
|
56
|
-
CONTAMINATION_SYMBOLS,
|
|
57
|
-
CONTAMINATION_STYLES,
|
|
58
|
-
CONTAMINATION_LEVEL_STRONG,
|
|
59
|
-
)
|
|
56
|
+
from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
|
|
60
57
|
from helm.benchmark.presentation.run_display import write_run_display_json
|
|
58
|
+
from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
|
|
61
59
|
|
|
62
60
|
|
|
63
61
|
OVERLAP_N_COUNT = 13
|
|
@@ -139,6 +137,38 @@ def get_scenario_name(group: RunGroup, scenario_spec: ScenarioSpec):
|
|
|
139
137
|
return group.name + "_" + dict_to_str(scenario_spec.args).replace(" ", "").replace("/", "_")
|
|
140
138
|
|
|
141
139
|
|
|
140
|
+
def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetadata:
|
|
141
|
+
"""Return the ModelMetadata for the model in the given AdapterSpec."""
|
|
142
|
+
# Get model metadata based on `model` in `adapter_spec`
|
|
143
|
+
try:
|
|
144
|
+
return get_model_metadata(adapter_spec.model)
|
|
145
|
+
except ValueError:
|
|
146
|
+
pass
|
|
147
|
+
|
|
148
|
+
# Get model metadata based on `model_deployment` in `adapter_spec`
|
|
149
|
+
try:
|
|
150
|
+
model_deployment = get_model_deployment(adapter_spec.model_deployment)
|
|
151
|
+
if model_deployment.model_name:
|
|
152
|
+
return get_model_metadata(model_deployment.model_name)
|
|
153
|
+
except ValueError:
|
|
154
|
+
pass
|
|
155
|
+
|
|
156
|
+
# In some cases, some models were renamed such that the old model name is now the model deployment name
|
|
157
|
+
# For instance, the model called "huggingface/gpt2" is now called "openai/gpt2", but its model deployment
|
|
158
|
+
# is still called "huggingface/gpt2".
|
|
159
|
+
# Handle these cases here.
|
|
160
|
+
# TODO: Delete this block eventually.
|
|
161
|
+
try:
|
|
162
|
+
model_deployment = get_model_deployment(adapter_spec.model)
|
|
163
|
+
if model_deployment.model_name:
|
|
164
|
+
return get_model_metadata(model_deployment.model_name)
|
|
165
|
+
except ValueError:
|
|
166
|
+
pass
|
|
167
|
+
|
|
168
|
+
# Return a placeholder "unknown model" model metadata.
|
|
169
|
+
return get_unknown_model_metadata(adapter_spec.model)
|
|
170
|
+
|
|
171
|
+
|
|
142
172
|
def get_coarse_adapter_spec(
|
|
143
173
|
adapter_spec: AdapterSpec, scenario_spec: Optional[ScenarioSpec] = None, adapter_keys_shown: List[str] = []
|
|
144
174
|
) -> AdapterSpec:
|
|
@@ -165,7 +195,7 @@ def get_coarse_adapter_spec(
|
|
|
165
195
|
|
|
166
196
|
# Create a new adapter_spec, keeping only the model and the keys in adapter_keys_shown
|
|
167
197
|
adapter_spec_kwargs = {key: adapter_spec.__dict__[key] for key in adapter_keys_shown}
|
|
168
|
-
return AdapterSpec(**adapter_spec_kwargs)
|
|
198
|
+
return AdapterSpec(**adapter_spec_kwargs)
|
|
169
199
|
|
|
170
200
|
|
|
171
201
|
def get_method_display_name(model_display_name: Optional[str], info: Dict[str, Any]) -> str:
|
|
@@ -178,6 +208,8 @@ def get_method_display_name(model_display_name: Optional[str], info: Dict[str, A
|
|
|
178
208
|
info = dict(info)
|
|
179
209
|
if "model" in info:
|
|
180
210
|
del info["model"]
|
|
211
|
+
if "model_deployment" in info:
|
|
212
|
+
del info["model_deployment"]
|
|
181
213
|
|
|
182
214
|
return (model_display_name or "???") + (f" [{dict_to_str(info)}]" if len(info) > 0 else "")
|
|
183
215
|
|
|
@@ -199,15 +231,7 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
|
|
|
199
231
|
if lower_is_better is None: # column does not have a meaningful ordering
|
|
200
232
|
continue
|
|
201
233
|
|
|
202
|
-
|
|
203
|
-
def is_cell_valid(cell: Cell) -> bool: # ignore cells which are strongly contaminated or have no value
|
|
204
|
-
if cell.value is None:
|
|
205
|
-
return False
|
|
206
|
-
if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
|
|
207
|
-
return False
|
|
208
|
-
return True
|
|
209
|
-
|
|
210
|
-
values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
|
|
234
|
+
values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
|
|
211
235
|
if len(values) < 2: # don't rank a single model
|
|
212
236
|
continue
|
|
213
237
|
for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
|
|
@@ -270,9 +294,11 @@ class Summarizer:
|
|
|
270
294
|
release: Optional[str],
|
|
271
295
|
suites: Optional[List[str]],
|
|
272
296
|
suite: Optional[str],
|
|
297
|
+
schema_path: str,
|
|
273
298
|
output_path: str,
|
|
274
299
|
verbose: bool,
|
|
275
300
|
num_threads: int,
|
|
301
|
+
allow_unknown_models: bool,
|
|
276
302
|
):
|
|
277
303
|
"""
|
|
278
304
|
A note on the relation between `release`, `suites`, and `suite`:
|
|
@@ -288,6 +314,7 @@ class Summarizer:
|
|
|
288
314
|
self.suites: List[str]
|
|
289
315
|
self.run_suite_paths: List[str]
|
|
290
316
|
self.suite: Optional[str] = None
|
|
317
|
+
self.schema_path = schema_path
|
|
291
318
|
self.release: Optional[str] = None
|
|
292
319
|
if suite:
|
|
293
320
|
self.suite = suite
|
|
@@ -301,12 +328,11 @@ class Summarizer:
|
|
|
301
328
|
self.run_suite_paths = [os.path.join(output_path, "runs", suite) for suite in suites]
|
|
302
329
|
self.verbose: bool = verbose
|
|
303
330
|
self.num_threads: int = num_threads
|
|
331
|
+
self.allow_unknown_models: bool = allow_unknown_models
|
|
304
332
|
|
|
305
333
|
ensure_directory_exists(self.run_release_path)
|
|
306
334
|
|
|
307
|
-
self.schema = read_schema()
|
|
308
|
-
self.contamination = read_contamination()
|
|
309
|
-
validate_contamination(self.contamination, self.schema)
|
|
335
|
+
self.schema = read_schema(schema_path)
|
|
310
336
|
|
|
311
337
|
def read_run(self, run_path: str) -> Run:
|
|
312
338
|
"""Load the `Run` object from `run_path`."""
|
|
@@ -334,7 +360,7 @@ class Summarizer:
|
|
|
334
360
|
if run_group_name not in self.schema.name_to_run_group:
|
|
335
361
|
hlog(
|
|
336
362
|
f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
|
|
337
|
-
f"but undefined in {
|
|
363
|
+
f"but undefined in {self.schema_path}, skipping"
|
|
338
364
|
)
|
|
339
365
|
continue
|
|
340
366
|
run_group = self.schema.name_to_run_group[run_group_name]
|
|
@@ -355,7 +381,13 @@ class Summarizer:
|
|
|
355
381
|
"""Load the runs in the run suite path."""
|
|
356
382
|
# run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
|
|
357
383
|
# so filter them out.
|
|
358
|
-
run_dir_names = sorted(
|
|
384
|
+
run_dir_names = sorted(
|
|
385
|
+
[
|
|
386
|
+
p
|
|
387
|
+
for p in os.listdir(run_suite_path)
|
|
388
|
+
if p != "eval_cache" and p != "groups" and os.path.isdir(os.path.join(run_suite_path, p))
|
|
389
|
+
]
|
|
390
|
+
)
|
|
359
391
|
for run_dir_name in tqdm(run_dir_names, disable=None):
|
|
360
392
|
run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
|
|
361
393
|
stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
|
|
@@ -363,13 +395,8 @@ class Summarizer:
|
|
|
363
395
|
hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
|
|
364
396
|
continue
|
|
365
397
|
run_path: str = os.path.join(run_suite_path, run_dir_name)
|
|
366
|
-
self.
|
|
367
|
-
|
|
368
|
-
# For each group (e.g., natural_qa), map
|
|
369
|
-
# (i) scenario spec (e.g., subject=philosophy) [optional] and
|
|
370
|
-
# (ii) adapter spec (e.g., model = openai/davinci)
|
|
371
|
-
# to list of runs
|
|
372
|
-
for run in self.runs:
|
|
398
|
+
run = self.read_run(run_path)
|
|
399
|
+
self.runs.append(run)
|
|
373
400
|
if run.run_spec.name in self.runs_to_run_suites:
|
|
374
401
|
hlog(
|
|
375
402
|
f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
|
|
@@ -377,12 +404,75 @@ class Summarizer:
|
|
|
377
404
|
)
|
|
378
405
|
self.runs_to_run_suites[run.run_spec.name] = suite
|
|
379
406
|
|
|
407
|
+
def group_runs(self):
|
|
408
|
+
# For each group (e.g., natural_qa), map
|
|
409
|
+
# (i) scenario spec (e.g., subject=philosophy) [optional] and
|
|
410
|
+
# (ii) adapter spec (e.g., model = openai/davinci)
|
|
411
|
+
# to list of runs
|
|
412
|
+
for run in self.runs:
|
|
380
413
|
scenario_spec = run.run_spec.scenario_spec
|
|
381
414
|
adapter_spec = run.run_spec.adapter_spec
|
|
382
415
|
for group_name in run.run_spec.groups:
|
|
383
416
|
self.group_adapter_to_runs[group_name][adapter_spec].append(run)
|
|
384
417
|
self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
|
|
385
418
|
|
|
419
|
+
@dataclass(frozen=True)
|
|
420
|
+
class _ModelField(Field):
|
|
421
|
+
"""The frontend version of ModelMetadata.
|
|
422
|
+
|
|
423
|
+
The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
|
|
424
|
+
|
|
425
|
+
All attributes have the same meaning as in ModelMetadata."""
|
|
426
|
+
|
|
427
|
+
# TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
|
|
428
|
+
creator_organization: Optional[str] = None
|
|
429
|
+
access: Optional[str] = None
|
|
430
|
+
todo: bool = False
|
|
431
|
+
release_date: Optional[str] = None
|
|
432
|
+
num_parameters: Optional[int] = None
|
|
433
|
+
|
|
434
|
+
def get_model_field_dicts(self) -> List[Dict]:
|
|
435
|
+
"""Get a list of `ModelField`s dicts that will be written to schema.json.
|
|
436
|
+
|
|
437
|
+
The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
|
|
438
|
+
|
|
439
|
+
This is populated by reading the `ModelMetadata` configs and filtering down to models that were
|
|
440
|
+
actually used, and converting each `ModelMetadata` to a `ModelField`."""
|
|
441
|
+
# TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
|
|
442
|
+
used_model_names: Set[str] = set()
|
|
443
|
+
for run in self.runs:
|
|
444
|
+
used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
|
|
445
|
+
|
|
446
|
+
model_field_dicts: List[Dict] = []
|
|
447
|
+
for model_name in get_all_models():
|
|
448
|
+
if model_name not in used_model_names:
|
|
449
|
+
continue
|
|
450
|
+
model_metadata = get_model_metadata(model_name)
|
|
451
|
+
model_field = Summarizer._ModelField(
|
|
452
|
+
name=model_metadata.name,
|
|
453
|
+
display_name=model_metadata.display_name,
|
|
454
|
+
short_display_name=model_metadata.display_name,
|
|
455
|
+
description=model_metadata.description,
|
|
456
|
+
creator_organization=model_metadata.creator_organization_name,
|
|
457
|
+
access=model_metadata.access,
|
|
458
|
+
todo=False,
|
|
459
|
+
release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
|
|
460
|
+
num_parameters=model_metadata.num_parameters,
|
|
461
|
+
)
|
|
462
|
+
model_field_dicts.append(asdict_without_nones(model_field))
|
|
463
|
+
return model_field_dicts
|
|
464
|
+
|
|
465
|
+
def write_schema(self) -> None:
|
|
466
|
+
"""Write the schema file to benchmark_output so the frontend knows about it."""
|
|
467
|
+
# Manually add the model metadata to the schema.json, where the frontend expects it.
|
|
468
|
+
# TODO: Move model metadata out of schema.json into its own model_metadata.json file.
|
|
469
|
+
raw_schema = asdict_without_nones(self.schema)
|
|
470
|
+
raw_schema["models"] = self.get_model_field_dicts()
|
|
471
|
+
write(
|
|
472
|
+
os.path.join(self.run_release_path, "schema.json"),
|
|
473
|
+
json.dumps(raw_schema, indent=2),
|
|
474
|
+
)
|
|
475
|
+
|
|
386
476
|
def read_runs(self):
|
|
387
477
|
self.runs: List[Run] = []
|
|
388
478
|
self.runs_to_run_suites: Dict[str, str] = {}
|
|
@@ -455,6 +545,7 @@ class Summarizer:
|
|
|
455
545
|
|
|
456
546
|
return file_metadata
|
|
457
547
|
|
|
548
|
+
# TODO: Delete this after @andyzorigin's project is done.
|
|
458
549
|
self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
|
|
459
550
|
|
|
460
551
|
data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
|
|
@@ -537,7 +628,7 @@ class Summarizer:
|
|
|
537
628
|
for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
|
|
538
629
|
if metric_name not in defined_metric_names:
|
|
539
630
|
hlog(
|
|
540
|
-
f"WARNING: metric name {metric_name} undefined in {
|
|
631
|
+
f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
|
|
541
632
|
f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
|
|
542
633
|
)
|
|
543
634
|
|
|
@@ -564,12 +655,12 @@ class Summarizer:
|
|
|
564
655
|
# TODO: move to write_executive_summary()
|
|
565
656
|
models_to_costs: Dict[str, Dict[str]] = defaultdict(lambda: defaultdict(int))
|
|
566
657
|
for run in self.runs:
|
|
567
|
-
|
|
658
|
+
deployment: str = run.run_spec.adapter_spec.model_deployment
|
|
568
659
|
|
|
569
660
|
for stat in run.stats:
|
|
570
661
|
stat_name = stat.name.name
|
|
571
662
|
if stat_name in Summarizer.COST_REPORT_FIELDS and not stat.name.split:
|
|
572
|
-
models_to_costs[
|
|
663
|
+
models_to_costs[deployment][stat_name] += stat.sum
|
|
573
664
|
|
|
574
665
|
# Do a second pass to add up the total number of tokens
|
|
575
666
|
for costs in models_to_costs.values():
|
|
@@ -639,7 +730,8 @@ class Summarizer:
|
|
|
639
730
|
header = [
|
|
640
731
|
HeaderCell("Group"),
|
|
641
732
|
HeaderCell("Description"),
|
|
642
|
-
# Synchronize these names with
|
|
733
|
+
# Synchronize these names with the appropriate schema file
|
|
734
|
+
# TODO: different schema files might have different fields (for multimodal)
|
|
643
735
|
HeaderCell("Adaptation method", description="Adaptation strategy (e.g., generation)"),
|
|
644
736
|
HeaderCell("# instances", description="Number of instances evaluated on"),
|
|
645
737
|
HeaderCell("# references", description="Number of references provided per instance"),
|
|
@@ -660,7 +752,7 @@ class Summarizer:
|
|
|
660
752
|
for subgroup in self.expand_subgroups(group):
|
|
661
753
|
for adapter_spec, runs in self.group_adapter_to_runs[subgroup.name].items():
|
|
662
754
|
filtered_runs = self.filter_runs_by_visibility(runs, subgroup)
|
|
663
|
-
models.add(adapter_spec.
|
|
755
|
+
models.add(adapter_spec.model_deployment)
|
|
664
756
|
methods.add(adapter_spec.method)
|
|
665
757
|
for run in filtered_runs:
|
|
666
758
|
num_instances.extend(get_all_stats_by_name(run.stats, "num_instances"))
|
|
@@ -668,9 +760,6 @@ class Summarizer:
|
|
|
668
760
|
num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
|
|
669
761
|
num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
|
|
670
762
|
|
|
671
|
-
if len(num_instances) == 0:
|
|
672
|
-
continue
|
|
673
|
-
|
|
674
763
|
rows.append(
|
|
675
764
|
[
|
|
676
765
|
Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
|
|
@@ -704,9 +793,9 @@ class Summarizer:
|
|
|
704
793
|
self,
|
|
705
794
|
runs: List[Run],
|
|
706
795
|
matcher: MetricNameMatcher,
|
|
707
|
-
contamination_level: Optional[str],
|
|
708
796
|
additional_info: Optional[str],
|
|
709
797
|
hide_value: bool = False,
|
|
798
|
+
is_scenario_table: bool = False,
|
|
710
799
|
) -> Cell:
|
|
711
800
|
"""
|
|
712
801
|
Use the metric name identified by `matcher` to pull out the stats from
|
|
@@ -760,18 +849,33 @@ class Summarizer:
|
|
|
760
849
|
if self.verbose:
|
|
761
850
|
description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
|
|
762
851
|
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
852
|
+
# Link the runs that this cell was aggregated from, if this is not a scenario table.
|
|
853
|
+
# Scenario tables link to the runs in the model cells,
|
|
854
|
+
# whereas non-scenario tables link to the runs in the metrics cells.
|
|
855
|
+
run_spec_names: Optional[List] = None
|
|
856
|
+
if not is_scenario_table:
|
|
857
|
+
# Deduplicate run spec names becuase aggregated_run_specs may have duplicated
|
|
858
|
+
# run specs if a run spec belongs to multiple groups.
|
|
859
|
+
run_spec_names = []
|
|
860
|
+
run_spec_names_set = set()
|
|
861
|
+
for run_spec_name in aggregated_run_specs:
|
|
862
|
+
if run_spec_name not in run_spec_names_set:
|
|
863
|
+
run_spec_names.append(run_spec_name)
|
|
864
|
+
run_spec_names_set.add(run_spec_name)
|
|
865
|
+
|
|
866
|
+
return Cell(
|
|
867
|
+
value=value,
|
|
868
|
+
description=description,
|
|
869
|
+
style={},
|
|
870
|
+
run_spec_names=run_spec_names,
|
|
871
|
+
)
|
|
768
872
|
|
|
769
873
|
def create_group_table(
|
|
770
874
|
self,
|
|
771
875
|
name: str,
|
|
772
876
|
title: str,
|
|
773
877
|
adapter_to_runs: Dict[AdapterSpec, List[Run]],
|
|
774
|
-
|
|
878
|
+
is_scenario_table: bool,
|
|
775
879
|
columns: List[Tuple[RunGroup, str]], # run_group, metric_group
|
|
776
880
|
sort_by_model_order: bool = True,
|
|
777
881
|
sub_split: Optional[str] = None,
|
|
@@ -810,7 +914,7 @@ class Summarizer:
|
|
|
810
914
|
matcher = replace(matcher, sub_split=sub_split)
|
|
811
915
|
header_field = self.schema.name_to_metric.get(matcher.name)
|
|
812
916
|
if header_field is None:
|
|
813
|
-
hlog(f"WARNING: metric name {matcher.name} undefined in {
|
|
917
|
+
hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
|
|
814
918
|
continue
|
|
815
919
|
metadata = {
|
|
816
920
|
"metric": header_field.get_short_display_name(),
|
|
@@ -863,55 +967,45 @@ class Summarizer:
|
|
|
863
967
|
|
|
864
968
|
adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
|
|
865
969
|
if sort_by_model_order:
|
|
866
|
-
# Sort models by the order defined in the
|
|
867
|
-
# Models not defined in the
|
|
868
|
-
# placed before models in defined the
|
|
869
|
-
model_order =
|
|
970
|
+
# Sort models by the order defined in the the model metadata config.
|
|
971
|
+
# Models not defined in the model metadata config will be sorted alphabetically and
|
|
972
|
+
# placed before models in defined the model metadata config.
|
|
973
|
+
model_order = get_all_models()
|
|
870
974
|
|
|
871
975
|
def _adapter_spec_sort_key(spec):
|
|
872
|
-
index = model_order.index(spec.
|
|
873
|
-
return (index, spec.
|
|
976
|
+
index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
|
|
977
|
+
return (index, spec.model_deployment)
|
|
874
978
|
|
|
875
979
|
adapter_specs = list(sorted(adapter_specs, key=_adapter_spec_sort_key))
|
|
876
980
|
|
|
877
981
|
# Pull out only the keys of the method adapter_spec that is needed to
|
|
878
982
|
# uniquely identify the method.
|
|
879
|
-
infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model"])
|
|
983
|
+
infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model_deployment", "model"])
|
|
880
984
|
|
|
881
985
|
assert len(adapter_specs) == len(infos), [adapter_specs, infos]
|
|
882
986
|
|
|
883
987
|
# Populate the contents of the table
|
|
884
988
|
rows = []
|
|
885
989
|
for adapter_spec, info in zip(adapter_specs, infos):
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
# Fall back to using the model name as the model display name if the model is not
|
|
890
|
-
# defined in the schema.
|
|
891
|
-
model_display_name = (
|
|
892
|
-
self.schema.name_to_model[model_name].display_name
|
|
893
|
-
if model_name in self.schema.name_to_model
|
|
894
|
-
else model_name
|
|
895
|
-
)
|
|
990
|
+
model_metadata = get_model_metadata_for_adapter_spec(adapter_spec)
|
|
991
|
+
|
|
992
|
+
model_name: str = model_metadata.name
|
|
896
993
|
|
|
897
994
|
runs = adapter_to_runs[adapter_spec]
|
|
898
|
-
display_name = get_method_display_name(
|
|
995
|
+
display_name = get_method_display_name(model_metadata.display_name, info)
|
|
899
996
|
|
|
900
|
-
# Link
|
|
901
|
-
|
|
997
|
+
# Link the runs that this row was aggregated from, if this is a scenario table.
|
|
998
|
+
# Scenario tables link to the runs in the model cells,
|
|
999
|
+
# whereas non-scenario tables link to the runs in the metrics cells.
|
|
1000
|
+
run_spec_names: Optional[List[str]]
|
|
1001
|
+
if is_scenario_table:
|
|
902
1002
|
run_spec_names = [run.run_spec.name for run in runs]
|
|
903
1003
|
href = run_spec_names_to_url(run_spec_names)
|
|
904
1004
|
else:
|
|
1005
|
+
run_spec_names = None
|
|
905
1006
|
href = None
|
|
906
1007
|
|
|
907
|
-
|
|
908
|
-
point = self.contamination.get_point(model_name, columns[0][0].name)
|
|
909
|
-
if num_groups == 1 and point is not None: # display contamination information at the adapter level
|
|
910
|
-
cells = [
|
|
911
|
-
Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
|
|
912
|
-
]
|
|
913
|
-
else:
|
|
914
|
-
cells = [Cell(display_name, description="", href=href)]
|
|
1008
|
+
cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
|
|
915
1009
|
assert len(group_names) == len(matchers)
|
|
916
1010
|
for group_name, matcher in zip(group_names, matchers):
|
|
917
1011
|
group_runs = [run for run in runs if group_name in run.run_spec.groups]
|
|
@@ -920,13 +1014,7 @@ class Summarizer:
                 if "babi" in group_name and "task:" not in name:
                     group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]

-
-                if point is not None:
-                    description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
-                    contamination_level = point.level
-                else:
-                    description = ""
-                    contamination_level = None
+                description = ""

                 group_overlap_stats = None
                 if (model_name, group_name) in self._model_group_overlap_stats:
@@ -948,9 +1036,9 @@ class Summarizer:
                         self.create_cell(
                             group_runs,
                             matcher,
-                            contamination_level,
                             additional_info=description,
                             hide_value=hide_value,
+                            is_scenario_table=is_scenario_table,
                         )
                     )

@@ -960,7 +1048,7 @@ class Summarizer:
         # There could be a ton of runs, so only do this if there are 2-5
         # TODO: replace in frontend with a selector to choose which rows to visualize.
         links = []
-        if
+        if is_scenario_table:
             all_run_spec_names = []
             for adapter_spec, runs in adapter_to_runs.items():
                 if len(runs) > 1:
@@ -1043,8 +1131,8 @@ class Summarizer:
                 title=display_name,
                 adapter_to_runs=adapter_to_runs,
                 columns=[(subgroup, metric_group) for subgroup in subgroups],
-
-                add_win_rate=
+                is_scenario_table=False,
+                add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
             )
             tables.append(table)
         return tables
@@ -1075,7 +1163,7 @@ class Summarizer:
                 name=scenario_name,
                 adapter_to_runs=adapter_to_runs,
                 columns=columns,
-
+                is_scenario_table=True,
             )
             tables.append(table)
             scenarios_shown += 1
@@ -1087,7 +1175,7 @@ class Summarizer:
                     name=f"{subgroup.name}:sub_split={sub_split}",
                     adapter_to_runs=adapter_to_runs,
                     columns=columns,
-
+                    is_scenario_table=False,
                     sub_split=sub_split,
                 )
                 tables.append(table)
@@ -1107,7 +1195,7 @@ class Summarizer:
                 name=subgroup.name,
                 adapter_to_runs=adapter_to_runs,
                 columns=columns,
-
+                is_scenario_table=False,
             )
             tables = [table] + tables
         all_tables.extend(tables)
@@ -1209,9 +1297,9 @@ class Summarizer:
         for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
             scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
             scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-            self.scenario_spec_instance_id_dict[
-                scenario_spec_instance_ids.scenario_spec
-            ] = scenario_spec_instance_ids.instance_ids
+            self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
+                scenario_spec_instance_ids.instance_ids
+            )

     def write_scenario_spec_instance_ids_json(self, file_path) -> None:
         for run in self.runs:
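The `cattrs.structure` call above is what turns each JSON line back into a typed record. A minimal sketch of that round-trip, using a simplified stand-in for `ScenarioSpecInstanceIds` (the real class keys on a `ScenarioSpec` object rather than a plain string, and the JSON payload below is illustrative):

```python
import json
from dataclasses import dataclass
from typing import List

import cattrs

@dataclass(frozen=True)
class ScenarioSpecInstanceIds:
    scenario_spec: str  # simplified: the real field is a ScenarioSpec object
    instance_ids: List[str]

# One line of the scenario spec instance ids file (illustrative content).
line = '{"scenario_spec": "mmlu:subject=anatomy", "instance_ids": ["id123", "id456"]}'
record = cattrs.structure(json.loads(line), ScenarioSpecInstanceIds)
print(record.scenario_spec, record.instance_ids)
```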
@@ -1249,11 +1337,12 @@ class Summarizer:
         if os.path.islink(symlink_path):
             # Remove the previous symlink if it exists.
             os.unlink(symlink_path)
-        os.symlink(os.path.
+        os.symlink(os.path.basename(self.run_release_path), symlink_path)

     def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
-        """Run the entire summarization pipeline
+        """Run the entire summarization pipeline."""
         self.read_runs()
+        self.group_runs()
         self.check_metrics_defined()

         self.write_run_display_json(skip_completed)
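The symlink change is subtle but useful: targeting `os.path.basename(...)` creates a *relative* symlink inside the releases directory, so the `latest` pointer stays valid if `benchmark_output` is moved or copied. A sketch under assumed paths (POSIX; the real values come from the Summarizer's configuration):

```python
import os

# Illustrative paths, not taken from the package.
releases_dir = "benchmark_output/releases"
run_release_path = os.path.join(releases_dir, "v0.5.0")
symlink_path = os.path.join(releases_dir, "latest")

os.makedirs(run_release_path, exist_ok=True)
if os.path.islink(symlink_path):
    os.unlink(symlink_path)  # replace a stale "latest" pointer
# Relative target: "latest" -> "v0.5.0", valid wherever the directory lives.
os.symlink(os.path.basename(run_release_path), symlink_path)
print(os.readlink(symlink_path))  # v0.5.0
```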
@@ -1266,6 +1355,10 @@ class Summarizer:
         # because it uses self.scenario_spec_instance_id_dict
         self.read_overlap_stats()

+        # Must happen after self.read_runs()
+        # because it uses self.runs
+        self.write_schema()
+
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1276,12 +1369,17 @@ class Summarizer:
         self.symlink_latest()


-@htrack(
+@htrack("summarize")
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
     )
+    parser.add_argument(
+        "--schema-path",
+        type=str,
+        help="Path to the schema file (e.g., schema_classic.yaml).",
+    )
     parser.add_argument(
         "--suite",
         type=str,
@@ -1312,6 +1410,18 @@ def main():
         help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
         default=1000,
     )
+    parser.add_argument(
+        "--local-path",
+        type=str,
+        help="If running locally, the path for `ServerService`.",
+        default="prod_env",
+    )
+    parser.add_argument(
+        "--allow-unknown-models",
+        type=bool,
+        help="Whether to allow unknown models in the metadata file",
+        default=True,
+    )
     args = parser.parse_args()

     release: Optional[str] = None
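One caveat on the new `--allow-unknown-models` flag: argparse applies `type=bool` to the raw command-line string, and `bool()` is truthy for any non-empty string, so `--allow-unknown-models False` still parses as `True`. A sketch of the pitfall and one common workaround (the `str_to_bool` helper below is illustrative, not part of the package):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--allow-unknown-models", type=bool, default=True)
args = parser.parse_args(["--allow-unknown-models", "False"])
print(args.allow_unknown_models)  # True, because bool("False") is True

def str_to_bool(value: str) -> bool:
    # Explicit string-to-bool conversion for argparse.
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"invalid boolean: {value!r}")

strict = argparse.ArgumentParser()
strict.add_argument("--allow-unknown-models", type=str_to_bool, default=True)
print(strict.parse_args(["--allow-unknown-models", "False"]).allow_unknown_models)  # False
```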
@@ -1335,14 +1445,21 @@ def main():
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")

+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
     # Output JSON files summarizing the benchmark results which will be loaded in the web interface
     summarizer = Summarizer(
         release=release,
         suites=suites,
         suite=suite,
+        schema_path=schema_path,
         output_path=args.output_path,
         verbose=args.debug,
         num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
     )
     summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
     hlog("Done.")
@@ -4,26 +4,26 @@ from typing import Any, Optional, List, Dict

 @dataclass(frozen=True)
 class Cell:
-    # Semantic value (that can be used for sorting)
     value: Any
+    """Semantic value (that can be used for sorting)"""

-    # Optionally, if we want to render things specially (floating points to 3 decimal points)
     display_value: Optional[str] = None
+    """Optionally, if we want to render things specially (floating points to 3 decimal points)"""

-    # Detailed description if hover over the cell
     description: Optional[str] = None
+    """Detailed description shown when hovering over the cell"""

-    # If we click on the link for this cell, it takes us somewhere
     href: Optional[str] = None
+    """If we click on the link for this cell, it takes us somewhere"""

-    # Styling
     style: Optional[Dict[str, Any]] = None
+    """Styling"""

-    # If the value or display_value is markdown that needs to be interpreted
     markdown: bool = False
+    """If the value or display_value is markdown that needs to be interpreted"""

-
-
+    run_spec_names: Optional[List[str]] = None
+    """The names of the runs that this cell's value was aggregated from, if the cell contains an aggregate value."""


 @dataclass(frozen=True)
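For orientation, a minimal usage sketch of the updated `Cell`: the new `run_spec_names` field lets the frontend recover which runs an aggregate value came from, independently of the `href` link. The module path is assumed to be `helm.benchmark.presentation.table`, and every field value below is illustrative:

```python
from helm.benchmark.presentation.table import Cell

cell = Cell(
    value=0.847,  # semantic value, used for sorting
    display_value="0.847",  # rendered form
    description="Mean accuracy aggregated over two runs",
    href="?runs=...",  # illustrative link target
    run_spec_names=["scenario:model=model_a", "scenario:model=model_b"],
)
print(cell.run_spec_names)
```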
@@ -1,9 +1,9 @@
-from helm.benchmark.presentation.schema import read_schema
+from helm.benchmark.presentation.schema import read_schema, get_default_schema_path
 from helm.benchmark.presentation.contamination import read_contamination, validate_contamination


 def test_contamination_schema():
-    schema = read_schema()
+    schema = read_schema(get_default_schema_path())
     contamination = read_contamination()
     validate_contamination(contamination, schema)
