crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/presentation/summarize.py CHANGED
@@ -9,12 +9,10 @@ Usage:
 """
 
 import argparse
-import cattrs
 import os
 import datetime
 import urllib.parse
 import json
-import yaml
 from collections import defaultdict
 from dataclasses import dataclass, replace
 from statistics import mean, median
@@ -35,8 +33,6 @@ from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
-from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric import get_all_stats_by_name
 from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -58,9 +54,6 @@ from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
 
 
-OVERLAP_N_COUNT = 13
-
-
 @dataclass(frozen=True)
 class ExecutiveSummary:
     """
@@ -226,17 +219,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     """
     assert aggregation in ["mean", "median"]
     win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
-    for …
+    for column_index, header_cell in enumerate(table.header):
         lower_is_better = header_cell.lower_is_better
         if lower_is_better is None:  # column does not have a meaningful ordering
             continue
- …
- …
- …
+        value_to_count: Dict[float, int] = defaultdict(int)
+        for row in table.rows:
+            value = row[column_index].value
+            if value is not None:
+                value_to_count[value] += 1
+        value_to_wins: Dict[float, float] = {}
+        acc_count = 0
+        for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
+            value_to_wins[value] = acc_count + ((value_count - 1) / 2)
+            acc_count += value_count
+        total_count = acc_count
+        if total_count < 2:
             continue
-        for …
- …
- …
+        for row_index, row in enumerate(table.rows):
+            value = row[column_index].value
+            if value is not None:
+                win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))
 
     # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
     # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
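The tie handling above gives each value in a tied group half credit against its peers before normalizing by (total - 1). The same per-column formula as a standalone sketch (win_rates_for_column is an illustrative name, not a HELM function):

from collections import defaultdict
from typing import Dict, List, Optional

def win_rates_for_column(values: List[Optional[float]], lower_is_better: bool = False) -> List[Optional[float]]:
    # Count occurrences of each non-null value in the column.
    value_to_count: Dict[float, int] = defaultdict(int)
    for v in values:
        if v is not None:
            value_to_count[v] += 1
    # A value "beats" every strictly worse value and gets half credit for ties.
    value_to_wins: Dict[float, float] = {}
    acc_count = 0
    for v, count in sorted(value_to_count.items(), reverse=lower_is_better):
        value_to_wins[v] = acc_count + (count - 1) / 2
        acc_count += count
    if acc_count < 2:  # a single comparable value yields no ranking
        return [None] * len(values)
    # Normalize by (total - 1) so win rates span [0, 1].
    return [None if v is None else value_to_wins[v] / (acc_count - 1) for v in values]

# Mirrors the new test_compute_win_rates_ties: the three-way tie at value 1
# averages ranks 0, 1, and 2 into one win apiece, i.e. 1 / (5 - 1) = 0.25.
assert win_rates_for_column([1, 1, 1, 4, 5]) == [0.25, 0.25, 0.25, 0.75, 1.0]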
@@ -251,7 +254,44 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     return aggregate_win_rates
 
 
- …
+def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
+    """
+    Computes the aggregate mean of each row across columns.
+    Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
+    non-null values of the row are in columns we skip).
+    """
+
+    row_means: List[Optional[float]] = []
+
+    # check for all header cells where specified, that lower_is_better is consistent
+    orderings = []
+    for elem in table.header:
+        orderings.append(elem.lower_is_better)
+    if len(set(orderings)) != 1:
+        raise Exception("Cannot mean columns with different values for lower_is_better")
+
+    for row in table.rows:
+        total = 0.0
+        count = 0
+        for cell in row:
+            if cell.value is not None:
+                total += float(cell.value)
+                count += 1
+        if count == 0:
+            row_means.append(None)
+        else:
+            row_means.append(total / count)
+
+    return row_means
+
+
+class AggregationStrategy:
+    # TODO: Convert to StrEnum after upgrading to Python 3.11
+    WIN_RATE = "win_rate"
+    MEAN = "mean"
+
+
+ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]
 
 
 class Summarizer:
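compute_aggregate_row_means is a plain per-row average over non-null cells, guarded by the check that every column agrees on lower_is_better. A self-contained sketch of the same averaging on raw values (row_means is an illustrative stand-in, not the HELM function):

from typing import List, Optional

def row_means(rows: List[List[Optional[float]]]) -> List[Optional[float]]:
    # Average the non-null cells of each row; None when nothing is comparable.
    means: List[Optional[float]] = []
    for row in rows:
        cells = [float(v) for v in row if v is not None]
        means.append(sum(cells) / len(cells) if cells else None)
    return means

# A row with no values stays None rather than defaulting to zero.
assert row_means([[1.0, 3.0], [0.5, None], [None, None]]) == [2.0, 0.5, None]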
@@ -483,137 +523,6 @@ class Summarizer:
         for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
             self.read_runs_for_suite(suite, run_suite_path)
 
-    def read_overlap_stats(self):
-        """
-        Load the overlap stats in the run suite path.
-        Concretely:
-        - get group -> scenario_spec information from self.runs
-          run_spec data
-        - read the files in the data_overlap directory in run_suite_path
-          which are scenario_spec -> overlap ids
-        - get aggregate stats for group -> overlap ratio
-        """
-
-        def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
-            scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
-            for run_spec in run_specs:
-                scenario_spec = run_spec.scenario_spec
-                groups = run_spec.groups
-                if (
-                    scenario_spec.class_name
-                    != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
-                ):
-                    scenario_specs_to_groups[scenario_spec] = groups
-
-            group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
-            for scenario_spec, groups in scenario_specs_to_groups.items():
-                for group in groups:
-                    if group not in group_to_scenario_specs:
-                        group_to_scenario_specs[group] = []
-                    group_to_scenario_specs[group].append(scenario_spec)
-            return group_to_scenario_specs
-
-        def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
-            """
-            Takes the data_overlap_dir as input and returns a dictionary
-            of stats_file_path -> List(model_names)
-
-            Sample input:
-            file_models_mapping:
-            - file_name: file1
-              model_names:
-              - model1
-              - model2
-            - file_name: file2
-              model_names:
-              - model2
-              - model3
-
-            """
-            metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
-            if not os.path.exists(metadata_file_path):
-                return {}
-
-            with open(metadata_file_path, "r") as yaml_file:
-                data = yaml.safe_load(yaml_file)
-
-            file_metadata: Dict[str, List[str]] = {}
-            for entry in data["file_models_mapping"]:
-                if "file_name" in entry and "model_names" in entry:
-                    file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
-                    file_metadata[file_path] = entry["model_names"]
-
-            return file_metadata
-
-        # TODO: Delete this after @andyzorigin's project is done.
-        self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
-            return
-
-        group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
-
-        stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
-
-        for file_path, model_names in stats_file_metadata.items():
-            overlap_stats_jsons = open(file_path, "r").readlines()
-
-            data_overlap_stats_list: List[DataOverlapStats] = []
-            for overlap_stats_json in overlap_stats_jsons:
-                overlap_stats_dict = json.loads(overlap_stats_json)
-                data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
-
-            scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
-            for data_overlap_stats in data_overlap_stats_list:
-                data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
-                n = data_overlap_stats_key.overlap_protocol_spec.n
-                if n == OVERLAP_N_COUNT:
-                    light_scenario_key = data_overlap_stats_key.light_scenario_key
-                    scenario_spec = light_scenario_key.scenario_spec
-                    if scenario_spec in self.scenario_spec_instance_id_dict:
-                        # Get statistics based on the subset of instance_ids that HELM uses for a scenario
-                        instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
-                        num_instances = len(instance_ids)
-                        num_overlapping_inputs = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
-                        )
-                        num_overlapping_references = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
-                        )
-                        scenario_spec_overlap_counts[scenario_spec] = (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        )
-
-            for group, scenario_specs in group_to_scenario_specs.items():
-                group_num_instances = 0
-                group_num_overlapping_inputs = 0
-                group_num_overlapping_references = 0
-                for scenario_spec in scenario_specs:
-                    if scenario_spec in scenario_spec_overlap_counts:
-                        (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        ) = scenario_spec_overlap_counts[scenario_spec]
-                        group_num_instances += num_instances
-                        group_num_overlapping_inputs += num_overlapping_inputs
-                        group_num_overlapping_references += num_overlapping_references
-                if group_num_instances != 0:
-                    group_overlap_stats = GroupOverlapStats(
-                        group=group,
-                        num_instances=group_num_instances,
-                        num_overlapping_inputs=group_num_overlapping_inputs,
-                        num_overlapping_references=group_num_overlapping_references,
-                    )
-                    for model_name in model_names:
-                        # Assume model name will only be associated with single group overlap list for now
-                        # can update to join lists if need arises
-                        self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats
-
     @htrack(None)
     def check_metrics_defined(self):
         """Check that all the metrics that appear in stats are defined."""
@@ -880,7 +789,7 @@ class Summarizer:
         sort_by_model_order: bool = True,
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
- …
+        aggregation_strategies: List[str] = [],
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1016,16 +925,6 @@ class Summarizer:
 
             description = ""
 
-            group_overlap_stats = None
-            if (model_name, group_name) in self._model_group_overlap_stats:
-                group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
-
-                description = (
-                    f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
-                    f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
-                    f"{description}"
-                )
-
             # HACK: we want to hide stats for the following model-metric combinations:
             #   1. Calibration metrics + AI21/Anthropic
             #   2. MSMARCO metrics + AI21/Anthropic
@@ -1063,21 +962,44 @@ class Summarizer:
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
+        aggregate_header_cells: List[HeaderCell] = []
+        aggregate_row_values: List[List[Optional[float]]] = []
+
+        for strategy in aggregation_strategies:
+            if strategy == AggregationStrategy.WIN_RATE:
+                WIN_RATE_AGGREGATION = "mean"
+                win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
+                description = "How many models this model outperforms on average (over columns)."
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
+                        description=description,
+                        lower_is_better=False,
+                    )
+                )
+                aggregate_row_values.append(win_rates)
+            elif strategy == AggregationStrategy.MEAN:
+                means = compute_aggregate_row_means(table)
+                description = "An average over columns representing the mean performance."
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        "Mean performance",
+                        description=description,
+                        lower_is_better=table.header[0].lower_is_better,
+                    )
+                )
+                aggregate_row_values.append(means)
+            else:
+                raise Exception(
+                    f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
+                )
+
+        for i in range(len(aggregate_header_cells)):
+            aggregate_header_cell = aggregate_header_cells[i]
+            aggregate_rows = aggregate_row_values[i]
+            table.header.insert(i + 1, aggregate_header_cell)
+            for row, row_val in zip(table.rows, aggregate_rows):
+                row.insert(i + 1, Cell(row_val))
 
         if bold_columns:
             for i, header_cell in enumerate(table.header):
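Because each aggregate column i is inserted at index i + 1, the aggregates land immediately after the leading model column, in the order the strategies were listed. A quick illustration on plain lists (made-up data, not HELM's Table type):

header = ["Model", "Scenario A", "Scenario B"]
rows = [["Model A", 0.9, 0.8]]
aggregates = [("Mean win rate", [0.75]), ("Mean performance", [0.85])]
for i, (name, values) in enumerate(aggregates):
    # Splice aggregate column i in right after the "Model" column.
    header.insert(i + 1, name)
    for row, value in zip(rows, values):
        row.insert(i + 1, value)
assert header == ["Model", "Mean win rate", "Mean performance", "Scenario A", "Scenario B"]
assert rows == [["Model A", 0.75, 0.85, 0.9, 0.8]]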
@@ -1125,14 +1047,22 @@ class Summarizer:
 
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
- …
+                metric_group_config = self.schema.name_to_metric_group[metric_group]
+                display_name = metric_group_config.get_short_display_name()
+                aggregate_strategies: List[str]
+                if metric_group_config.aggregation_strategies is not None:
+                    aggregate_strategies = metric_group_config.aggregation_strategies
+                elif metric_group_config.hide_win_rates:
+                    aggregate_strategies = []
+                else:
+                    aggregate_strategies = [AggregationStrategy.WIN_RATE]
                 table = self.create_group_table(
                     name=metric_group,
                     title=display_name,
                     adapter_to_runs=adapter_to_runs,
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
- …
+                    aggregation_strategies=aggregate_strategies,
                 )
                 tables.append(table)
         return tables
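The selection above means the schema now controls aggregation per metric group: an explicit aggregation_strategies list wins, hide_win_rates suppresses the default, and otherwise the legacy single win-rate column is kept. A sketch of that fallback order (resolve_strategies is a hypothetical helper, not HELM code):

from typing import List, Optional

def resolve_strategies(aggregation_strategies: Optional[List[str]], hide_win_rates: bool) -> List[str]:
    if aggregation_strategies is not None:
        return aggregation_strategies  # explicit schema setting wins
    if hide_win_rates:
        return []  # suppress the legacy win-rate column
    return ["win_rate"]  # historical default

assert resolve_strategies(None, False) == ["win_rate"]
assert resolve_strategies(None, True) == []
assert resolve_strategies(["mean", "win_rate"], False) == ["mean", "win_rate"]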
@@ -1262,72 +1192,6 @@ class Summarizer:
 
         parallel_map(process, self.runs, parallelism=self.num_threads)
 
-    def read_scenario_spec_instance_ids(self, num_instances) -> None:
-        """
-        This file checks if there exists a file, scenario_spec_instance_ids.json
-        that it can read the instance_ids associated with scenario_specs.
-
-        It will write the num_instances used in the run as part of the file name
-
-        If it doesn't exist, it will go through all the scenario_state files
-        and parse the instance_ids and output it to the file for future uses
-
-        Only when the scenario_specs for the data overlap script change
-        (or num_instances are different), will this need to be rerun.
-
-        In such cases, do not include the file as part of the data_overlap directory.
-        """
-        self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
-            return
-
-        scenario_spec_instance_ids_json = os.path.join(
-            data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
-        )
-        if not os.path.exists(scenario_spec_instance_ids_json):
-            hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
-            self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
-        else:
-            hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
-            scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
-
-            for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
-                scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
-                scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-                self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
-                    scenario_spec_instance_ids.instance_ids
-                )
-
-    def write_scenario_spec_instance_ids_json(self, file_path) -> None:
-        for run in self.runs:
-            run_spec = run.run_spec
-            scenario_spec = run_spec.scenario_spec
-            if scenario_spec in self.scenario_spec_instance_id_dict:
-                continue
-
-            run_path = run.run_path
-            instances_file_path = os.path.join(run_path, "instances.json")
-            with open(instances_file_path, "r") as f:
-                raw_instances = json.load(f)
-
-            # Optimization: Don't structure to dataclass, since we only need to read `id`
-            instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
-            self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
-
-        all_scenario_spec_instance_ids = []
-        for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
-            scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
-            all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
-
-        with open(file_path, "w") as f:
-            f.writelines(
-                f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
-                for scenario_spec_instance_ids in all_scenario_spec_instance_ids
-            )
-
     def symlink_latest(self) -> None:
         # Create a symlink runs/latest -> runs/<name_of_suite>,
         # so runs/latest always points to the latest run suite.
@@ -1339,7 +1203,7 @@ class Summarizer:
             os.unlink(symlink_path)
         os.symlink(os.path.basename(self.run_release_path), symlink_path)
 
-    def run_pipeline(self, skip_completed: bool …
+    def run_pipeline(self, skip_completed: bool) -> None:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
@@ -1347,14 +1211,6 @@ class Summarizer:
 
         self.write_run_display_json(skip_completed)
 
-        # Must happen after summarizer.write_run_display_json()
-        # because it uses instances.json files
-        self.read_scenario_spec_instance_ids(num_instances)
-
-        # Must happen after summarizer.read_scenario_spec_instance_ids()
-        # because it uses self.scenario_spec_instance_id_dict
-        self.read_overlap_stats()
-
         # Must happen after self.read_runs()
         # because it uses self.runs
         self.write_schema()
@@ -1404,12 +1260,6 @@ def main():
         action="store_true",
         help="Skip write_run_display_json() for runs which already have all output display JSON files",
     )
-    parser.add_argument(
-        "-num-instances",
-        type=int,
-        help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
-        default=1000,
-    )
     parser.add_argument(
         "--local-path",
         type=str,
@@ -1461,7 +1311,7 @@ def main():
         num_threads=args.num_threads,
         allow_unknown_models=args.allow_unknown_models,
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json …
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
    hlog("Done.")
 
 
helm/benchmark/presentation/test_run_entry.py CHANGED
@@ -16,6 +16,7 @@ class TestRunEntry:
 
     @pytest.mark.parametrize("fname", list_fnames())
     def test_read_all_specs(self, fname: str):
+        pytest.skip("Skipping slow tests")
         run_entries = read_run_entries([fname])
         for entry in run_entries.entries:
             construct_run_specs(parse_object_spec(entry.description))
helm/benchmark/presentation/test_summarize.py CHANGED
@@ -1,8 +1,9 @@
 import os
 import tempfile
 
-from helm.benchmark.presentation.summarize import Summarizer
+from helm.benchmark.presentation.summarize import Summarizer, compute_aggregate_row_win_rates
 from helm.benchmark.presentation.schema import get_default_schema_path
+from helm.benchmark.presentation.table import Cell, HeaderCell, Table
 from helm.common.general import ensure_directory_exists
 
 
@@ -19,7 +20,7 @@ def test_summarize_suite():
         num_threads=4,
         allow_unknown_models=True,
     )
-    summarizer.run_pipeline(skip_completed=True …
+    summarizer.run_pipeline(skip_completed=True)
     assert os.path.isfile(os.path.join(output_path, "runs", "test_suite", "groups.json"))
 
 
@@ -37,5 +38,146 @@ def test_summarize_release():
         num_threads=4,
         allow_unknown_models=True,
     )
-    summarizer.run_pipeline(skip_completed=True …
+    summarizer.run_pipeline(skip_completed=True)
     assert os.path.isfile(os.path.join(output_path, "releases", "test_release", "groups.json"))
+
+
+def test_compute_win_rates_one_scenario():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 2],
+        ["Model C", 3],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_two_scenarios():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, 2],
+        ["Model D", 4, 5],
+        ["Model E", 5, 4],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.25, 0.125, 0.375, 0.875, 0.875]
+
+
+def test_compute_win_rates_incomplete_values():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, None],
+        ["Model D", 4, None],
+        ["Model E", 5, None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.5, 0.125, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_ignore_nones():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+        HeaderCell(value="Scenario C", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, None, None],
+        ["Model B", 2, None, 1],
+        ["Model C", 3, None, None],
+        ["Model D", 4, None, None],
+        ["Model E", 5, None, None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_ignore_unset_lower_is_better():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B"),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, 2],
+        ["Model D", 4, 5],
+        ["Model E", 5, 4],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_no_win_rate():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", None],
+        ["Model B", None],
+        ["Model C", None],
+        ["Model D", None],
+        ["Model E", None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [None, None, None, None, None]
+
+
+def test_compute_win_rates_ties():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 1],
+        ["Model C", 1],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.25, 0.25, 0.25, 0.75, 1.0]
+
+
+def test_compute_win_rates_lower_is_better():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=True),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 2],
+        ["Model C", 3],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [1, 0.75, 0.5, 0.25, 0]
helm/benchmark/run.py CHANGED
@@ -1,9 +1,11 @@
 import argparse
 from dataclasses import replace
 import os
+import re
 from typing import List, Optional
 
 
+from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
@@ -314,6 +316,19 @@ def main():
     ensure_directory_exists(args.output_path)
     set_benchmark_output_path(args.output_path)
 
+    # Validate the --models-to-run flag
+    if args.models_to_run:
+        all_models = set(model_metadata_registry.get_all_models())
+        for model_to_run in args.models_to_run:
+            if model_to_run not in all_models:
+                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+    else:
+        model_expander_pattern = re.compile(
+            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
+        )
+        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,