crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/reeval_runner.py ADDED
@@ -0,0 +1,355 @@
+ import dacite
+ import json
+ import os
+ import typing
+ from collections import Counter
+ from typing import Any, Dict, List, Optional
+ import torch
+
+ from tqdm import tqdm
+ from dataclasses import replace
+ from datasets import load_dataset
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.common.general import ensure_directory_exists, write, asdict_without_nones
+ from helm.common.hierarchical_logger import hlog, htrack_block
+ from helm.common.cache import cache_stats
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     create_scenario,
+     Instance,
+     get_scenario_cache_path,
+     with_instance_ids,
+ )
+ from helm.benchmark.adaptation.adapters.adapter import Adapter
+ from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
+ from helm.benchmark.run_spec import RunSpec
+ from helm.benchmark.data_preprocessor import DataPreprocessor
+ from helm.benchmark.executor import ExecutionSpec
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+ from helm.benchmark.runner import (
+     Runner,
+     remove_stats_nans,
+     remove_per_instance_stats_nans,
+ )
+
+ scenario_to_metric_name = {
+     "air_bench_2024": "air_score",
+     "babi_qa": "quasi_exact_match",
+     "bbq": "quasi_exact_match",
+     "blimp": "exact_match",
+     "boolq": "quasi_exact_match",
+     "civil_comments": "quasi_exact_match",
+     "dyck_language": "exact_match_indicator",
+     "entity_data_imputation": "quasi_exact_match",
+     "entity_matching": "quasi_exact_match",
+     "imdb": "quasi_exact_match",
+     "legal_support": "quasi_exact_match",
+     "raft": "quasi_exact_match",
+     "synthetic_reasoning": "quasi_exact_match",
+     "truthful_qa": "exact_match",
+     "wikifact": "quasi_exact_match",
+     "mmlu": "exact_match",
+     "commonsense": "exact_match",
+     "gsm": "final_number_exact_match",
+     # "gsm": ["exact_match_indicator", "final_number_exact_match"],
+     "legalbench": "quasi_exact_match",
+     "math": "math_equiv_chain_of_thought",
+     "med_qa": "quasi_exact_match",
+     "thai_exam": "exact_match",
+ }
+
+
+ class REEvalRunner(Runner):
+     """
+     This runner implements the basic (non-amortized) method described in the paper
+     `Reliable and Efficient Amortized Model-Based Evaluation`. This approach, which is
+     also known as Computerized Adaptive Testing (CAT) within the framework of Item Response
+     Theory (IRT), leverages adaptive testing to evaluate model performance.
+
+     The difficulties of the questions are provided in a HuggingFace repository. In addition,
+     the authors of the paper will supply a Python package for calculating these difficulties.
+     At each iteration, the runner estimates the model's ability based on all previously
+     administered questions and their corresponding responses. It then selects the next question
+     whose difficulty is closest to the estimated ability, thereby reliably and efficiently
+     eliciting the model's ability.
+     """
+
+     def __init__(
+         self,
+         execution_spec: ExecutionSpec,
+         output_path: str,
+         suite: str,
+         skip_instances: bool,
+         cache_instances: bool,
+         cache_instances_only: bool,
+         skip_completed_runs: bool,
+         exit_on_error: bool,
+     ):
+         super().__init__(
+             execution_spec=execution_spec,
+             output_path=output_path,
+             suite=suite,
+             skip_instances=skip_instances,
+             cache_instances=cache_instances,
+             cache_instances_only=cache_instances_only,
+             skip_completed_runs=skip_completed_runs,
+             exit_on_error=exit_on_error,
+         )
+
+     def _estimate_model_ability(
+         self,
+         old_ability: float,
+         response_correctness: List[float],
+         instance_difficulties: List[float],
+     ) -> float:
+         def closure():
+             optim.zero_grad()
+             probs = torch.sigmoid(ability + difficulties)
+             loss = -torch.distributions.Bernoulli(probs=probs).log_prob(responses).mean()
+             loss.backward()
+             return loss
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         responses = torch.tensor(response_correctness, device=device)
+         difficulties = torch.tensor(instance_difficulties, device=device)
+         ability = torch.tensor([old_ability], requires_grad=True, device=device)
+         optim = torch.optim.LBFGS([ability], lr=0.1, max_iter=20, history_size=10, line_search_fn="strong_wolfe")
+
+         for iteration in range(100):
+             loss = optim.step(closure)
+
+             if iteration > 0:
+                 prev_ability = ability.clone()
+                 prev_loss = loss
+                 d_loss = prev_loss - loss
+                 d_theta = torch.norm(prev_ability - ability, p=2)
+                 grad_norm = torch.norm(optim.param_groups[0]["params"][0].grad, p=2)
+                 if d_loss < 1e-5 and d_theta < 1e-5 and grad_norm < 1e-5:
+                     break
+
+         return ability.item()
+
+     def run_one(self, run_spec: RunSpec):
+         run_path: str = self._get_run_path(run_spec)
+         if self.skip_completed_runs and self._is_run_completed(run_path):
+             hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+             return
+         ensure_directory_exists(run_path)
+
+         # Load the scenario
+         scenario: Scenario = create_scenario(run_spec.scenario_spec)
+
+         # This 'output_path' will be used when the model's input instances are saved.
+         args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
+         scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
+         input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
+         input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
+
+         instances: List[Instance]
+         if self.skip_instances:
+             instances = []
+         else:
+             if self.cache_instances and os.path.exists(input_instances_file_path):
+                 with open(input_instances_file_path) as f:
+                     json_instances: List[Dict[str, Any]] = json.load(f)
+                 instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
+             else:
+                 # Create the instances of the scenario
+                 scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
+                 with htrack_block("scenario.get_instances"):
+                     instances = scenario.get_instances(scenario_output_path)
+             if self.cache_instances and not os.path.exists(input_instances_file_path):
+                 # Save instances to file
+                 ensure_directory_exists(input_instances_output_path)
+                 write(
+                     os.path.join(input_instances_file_path),
+                     json.dumps([asdict_without_nones(instance) for instance in instances], indent=2),
+                 )
+         if self.cache_instances_only:
+             return  # Exit after saving the instances.
+
+         # Give each instance a unique ID
+         if any([instance.id is None for instance in instances]):
+             instances = with_instance_ids(instances)
+
+         # Data preprocessing
+         instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
+             instances, self.executor.execution_spec.parallelism
+         )
+
+         # Adapt (convert to requests)
+         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+         unasked_request_states_without_difficulty: List[RequestState] = adapter.adapt(
+             instances, self.executor.execution_spec.parallelism
+         )
+
+         # Load difficulty
+         split_name = "dyck_language_np_3" if scenario.name == "dyck_language" else scenario.name
+         try:
+             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
+             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
+         except ValueError:
+             hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+             return
+
+         unasked_request_states: List[RequestState] = []
+         for request_state in unasked_request_states_without_difficulty:
+             prompt = request_state.request.prompt
+             if prompt in prompt_to_difficulty:
+                 difficulty = prompt_to_difficulty[prompt]
+                 current_extra_data = request_state.instance.extra_data or {}
+                 if "difficulty" in current_extra_data:
+                     raise Exception("Extra_data already contains a 'difficulty' key.")
+                 new_extra_data = current_extra_data.copy()
+                 new_extra_data["difficulty"] = difficulty
+                 new_instance = replace(request_state.instance, extra_data=new_extra_data)
+                 new_request_state = replace(request_state, instance=new_instance)
+                 unasked_request_states.append(new_request_state)
+         assert unasked_request_states
+
+         # Execute the requests in a reeval manner
+         assert run_spec.adapter_spec.reeval_parameters is not None
+         model_ability: float = run_spec.adapter_spec.reeval_parameters.model_ability or 0.0
+         scenario_metric_name: str = scenario_to_metric_name[scenario.name]
+
+         asked_request_states: List[RequestState] = []
+         reeval_trajectory: Dict[str, List[float]] = {
+             "model_ability": [],
+             "response_correctness": [],
+             "instance_difficulties": [],
+         }
+
+         assert run_spec.adapter_spec.max_eval_instances is not None
+         for _ in tqdm(range(run_spec.adapter_spec.max_eval_instances), desc="REEval Execution"):
+             if not unasked_request_states:
+                 break
+
+             selected_item: Optional[RequestState] = None
+             min_diff = float("inf")
+             for item in unasked_request_states:
+                 assert item.instance.extra_data is not None
+                 diff = abs(item.instance.extra_data["difficulty"] + model_ability)
+                 if diff < min_diff:
+                     min_diff = diff
+                     selected_item = item
+             assert selected_item is not None
+             unasked_request_states.remove(selected_item)
+
+             # Execute the request
+             single_scenario_state: ScenarioState = ScenarioState(
+                 adapter_spec=run_spec.adapter_spec,
+                 request_states=[selected_item],
+                 annotator_specs=run_spec.annotators,
+             )
+
+             # Execute (fill up results)
+             single_scenario_state = self.executor.execute(single_scenario_state)
+
+             # Annotate (post-process the results)
+             single_scenario_state = self.annotator_executor.execute(single_scenario_state)
+
+             # Apply the metrics
+             # When performing a dry run, only estimate the number of tokens instead
+             # of calculating the metrics.
+             metrics: List[MetricInterface] = (
+                 [DryRunMetric()]
+                 if self.dry_run
+                 else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
+             )
+
+             temp_per_instance_stats: List[PerInstanceStats] = []
+             with htrack_block(f"{len(metrics)} metrics"):
+                 for metric in metrics:
+                     with htrack_block(metric):
+                         temp_metric_result: MetricResult = metric.evaluate(
+                             single_scenario_state,
+                             self.metric_service,
+                             self.eval_cache_path,
+                             self.executor.execution_spec.parallelism,
+                         )
+                         temp_per_instance_stats.extend(temp_metric_result.per_instance_stats)
+
+             # Update the reeval request states
+             asked_request_states.extend(single_scenario_state.request_states)
+
+             # Update the reeval trajectory
+             reeval_trajectory["model_ability"].append(model_ability)
+             scenario_metric_value = [
+                 s for s in temp_per_instance_stats[0].stats if s.name.name == scenario_metric_name
+             ][0].mean
+
+             assert scenario_metric_value is not None
+             reeval_trajectory["response_correctness"].append(scenario_metric_value)
+             assert selected_item.instance.extra_data is not None
+             reeval_trajectory["instance_difficulties"].append(selected_item.instance.extra_data["difficulty"])
+
+             # Estimate the model ability
+             model_ability = self._estimate_model_ability(
+                 old_ability=model_ability,
+                 response_correctness=reeval_trajectory["response_correctness"],
+                 instance_difficulties=reeval_trajectory["instance_difficulties"],
+             )
+
+         # Create the scenario state
+         scenario_state: ScenarioState = ScenarioState(
+             adapter_spec=run_spec.adapter_spec,
+             request_states=asked_request_states,
+             annotator_specs=run_spec.annotators,
+         )
+
+         stats: List[Stat] = []
+         per_instance_stats: List[PerInstanceStats] = []
+         with htrack_block(f"{len(metrics)} metrics"):
+             for metric in metrics:
+                 with htrack_block(metric):
+                     metric_result: MetricResult = metric.evaluate(
+                         scenario_state,
+                         self.metric_service,
+                         self.eval_cache_path,
+                         self.executor.execution_spec.parallelism,
+                     )
+                     stats.extend(metric_result.aggregated_stats)
+                     per_instance_stats.extend(metric_result.per_instance_stats)
+
+         # Check that there aren't duplicate `Stat`s
+         # Note: doesn't catch near misses.
+         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
+         for metric_name, count in metric_counts.items():
+             if count > 1:
+                 hlog(f"WARNING: duplicate metric name {metric_name}")
+
+         # Print out the number of stats
+         hlog(f"Generated {len(stats)} stats.")
+
+         if self.skip_instances:
+             hlog("skip_instances was True. Skipping writing results out.")
+             return
+
+         # Output benchmarking information and results to files
+         write(os.path.join(run_path, "run_spec.json"), json.dumps(asdict_without_nones(run_spec), indent=2))
+
+         # Write out scenario
+         write(os.path.join(run_path, "scenario.json"), json.dumps(asdict_without_nones(scenario), indent=2))
+
+         # Write scenario state
+         write(os.path.join(run_path, "scenario_state.json"), json.dumps(asdict_without_nones(scenario_state), indent=2))
+
+         write(
+             os.path.join(run_path, "stats.json"),
+             json.dumps([asdict_without_nones(stat) for stat in remove_stats_nans(stats)], indent=2),
+         )
+         write(
+             os.path.join(run_path, "per_instance_stats.json"),
+             json.dumps(list(map(asdict_without_nones, remove_per_instance_stats_nans(per_instance_stats))), indent=2),
+         )
+
+         write(
+             os.path.join(run_path, "reeval_trajectory.json"),
+             json.dumps(reeval_trajectory, indent=2),
+         )
+
+         cache_stats.print_status()
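
In IRT terms, REEvalRunner fits a one-parameter (Rasch-style) model in which the probability of a correct response is sigmoid(ability + difficulty), with per-item difficulties ("z") taken from the stair-lab/reeval-difficulty dataset, and it always asks next the item whose difficulty is closest to the negated ability estimate. A rough, self-contained sketch of those two steps follows; plain gradient ascent stands in for the LBFGS fit above, and the helper names are illustrative, not HELM's:

import math
from typing import List, Tuple

def estimate_ability(correct: List[float], difficulties: List[float], lr: float = 0.5, steps: int = 200) -> float:
    """Maximum-likelihood ability under P(correct) = sigmoid(ability + difficulty)."""
    assert correct and len(correct) == len(difficulties)
    ability = 0.0
    for _ in range(steps):
        # The gradient of the mean Bernoulli log-likelihood w.r.t. ability is mean(y - p).
        grad = sum(y - 1.0 / (1.0 + math.exp(-(ability + z))) for y, z in zip(correct, difficulties))
        ability += lr * grad / len(correct)
    return ability

def select_next(pool: List[Tuple[str, float]], ability: float) -> Tuple[str, float]:
    """Pick the unasked (item_id, difficulty) pair minimizing abs(difficulty + ability),
    mirroring the selection loop in run_one above."""
    return min(pool, key=lambda item: abs(item[1] + ability))

After each answered item, the runner re-fits the ability over the full history of (correctness, difficulty) pairs, warm-started from the previous estimate, and records the whole trajectory in reeval_trajectory.json.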
helm/benchmark/run.py CHANGED
@@ -266,13 +266,6 @@ def main():
          default=None,
          help="Full class name of the Runner class to use. If unset, uses the default Runner.",
      )
-     parser.add_argument(
-         "--openvino",
-         action="store_true",
-         default=False,
-         help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
-         "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
-     )
      add_run_args(parser)
      args = parser.parse_args()
      validate_args(args)
@@ -284,19 +277,13 @@ def main():
          from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
          for huggingface_model_name in args.enable_huggingface_models:
-             if args.openvino:
-                 register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
-             else:
-                 register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+             register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
      if args.enable_local_huggingface_models:
          from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
          for huggingface_model_path in args.enable_local_huggingface_models:
-             if args.openvino:
-                 register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
-             else:
-                 register_huggingface_local_model_from_flag_value(huggingface_model_path)
+             register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
      run_entries: List[RunEntry] = []
      if args.conf_paths:
@@ -323,12 +310,16 @@ def main():
          if model_to_run not in all_models:
              raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
      else:
-         model_expander_pattern = re.compile(
+         model_expander_wildcard_pattern = re.compile(
              r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
          )
-         if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+         if any(model_expander_wildcard_pattern.search(run_entry.description) for run_entry in run_entries):
              raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
 
+         model_expander_pattern = re.compile(r"\bmodel=\b")
+         if not any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+             raise Exception("--models-to-run must be set if the `models=` run expander is omitted")
+
      run_specs = run_entries_to_run_specs(
          run_entries=run_entries,
          max_eval_instances=args.max_eval_instances,
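
Net effect of the hunk above: run entries that expand to a wildcard model group (model=all, model=text, and so on) still require --models-to-run, and run entries with no model= clause at all are now rejected too. A quick sketch of what the new guard matches, using hypothetical run-entry strings:

import re

model_expander_pattern = re.compile(r"\bmodel=\b")

assert model_expander_pattern.search("mmlu:subject=anatomy,model=openai/gpt2") is not None
# Entries like the next one contain no model= clause; if none of the run
# entries match, the new exception is raised.
assert model_expander_pattern.search("mmlu:subject=anatomy") is None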
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
3
3
  from dataclasses import replace
4
4
  from typing import Any, List, Dict, Optional, Tuple, Type
5
5
 
6
+ from helm.benchmark.metrics.metric import MetricSpec
6
7
  from helm.benchmark.model_metadata_registry import (
7
8
  get_all_instruction_following_models,
8
9
  get_all_code_models,
@@ -11,19 +12,21 @@ from helm.benchmark.model_metadata_registry import (
     get_model_metadata,
     get_model_names_with_tag,
     DEPRECATED_MODEL_TAG,
+    UNSUPPORTED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
-from .run_spec import RunSpec
+from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
-from .augmentations.perturbation import PerturbationSpec
-from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.augmentations.perturbation import PerturbationSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
 
 
@@ -347,6 +350,29 @@ class AnthropicClaude3RunExpander(RunExpander):
         return [run_spec]
 
 
+class NovaRunExpander(RunExpander):
+    """
+    Custom prompt for Amazon Nova models.
+    These models need more explicit instructions about following the format.
+    """
+
+    name = "amazon-nova"
+
+    PROMPT = "Do not provide any additional explanation. Follow the format shown in the provided examples strictly."
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(run_spec.adapter_spec, global_prefix=NovaRunExpander.PROMPT + "\n\n"),
+            ),
+        ]
+
+
 class FollowFormatInstructionsRunExpander(RunExpander):
     """Adds more explicit instructions about following the format to prompts.
 
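Reviewer note: NovaRunExpander follows the usual expander pattern, returning a copy of the RunSpec whose adapter_spec has its global_prefix overwritten with the fixed instruction. A rough sketch of the effect:

    expander = NovaRunExpander()
    expanded = expander.expand(run_spec)[0]
    # expanded.adapter_spec.global_prefix is now the PROMPT string followed by a blank line;
    # every other RunSpec field, including the name, is unchanged.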
@@ -588,6 +614,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
         "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
         "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
         "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
+        "audiolm": get_model_names_with_tag(AUDIO_LANGUAGE_MODEL_TAG),
     }
 
     # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
@@ -604,8 +631,10 @@ class ModelRunExpander(ReplaceValueRunExpander):
 
     # For each of the keys above, filter out deprecated models.
     deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+    unsupported_models = set(get_model_names_with_tag(UNSUPPORTED_MODEL_TAG))
+    excluded_models = deprecated_models | unsupported_models
     for family_name in values_dict.keys():
-        values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+        values_dict[family_name] = [model for model in values_dict[family_name] if model not in excluded_models]
 
     return values_dict
 
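Reviewer note: the comment above now understates the behavior slightly, since both deprecated and unsupported models are excluded via a single set union. A self-contained sketch of the same pattern (model names invented for illustration):

    deprecated_models = {"example-org/legacy-model"}
    unsupported_models = {"example-org/unsupported-model"}
    excluded_models = deprecated_models | unsupported_models

    models = ["example-org/legacy-model", "example-org/current-model", "example-org/unsupported-model"]
    kept = [model for model in models if model not in excluded_models]
    assert kept == ["example-org/current-model"]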
@@ -1424,14 +1453,20 @@ class OutputFormatInstructions(RunExpander):
     name = "output_format_instructions"
 
     _SUFFIX_SUFFIX = "_suffix"
+    _NO_PREFIX_SUFFIX = "_no_prefix"
 
     def __init__(self, scenario: str):
+        self.suffix = False
         if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
-            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            scenario = scenario.removesuffix(OutputFormatInstructions._SUFFIX_SUFFIX)
             self.suffix = True
-        else:
-            self.scenario = scenario
-            self.suffix = False
+
+        self.no_prefix = False
+        if scenario.endswith(OutputFormatInstructions._NO_PREFIX_SUFFIX):
+            scenario = scenario.removesuffix(OutputFormatInstructions._NO_PREFIX_SUFFIX)
+            self.no_prefix = True
+
+        self.scenario = scenario
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
@@ -1441,6 +1476,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
             else:
                 instructions = "Answer with only a single letter."
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1452,6 +1489,8 @@ class OutputFormatInstructions(RunExpander):
                 )
             elif self.scenario == "natural_qa":
                 instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "natural_qa_short_answer":
+                instructions = "Answer with a short answer."
             elif self.scenario == "legalbench":
                 if output_noun != "Answer":
                     instructions = f"Answer with the {output_noun.lower()}."
@@ -1483,6 +1522,11 @@ class OutputFormatInstructions(RunExpander):
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
 
+        if self.no_prefix:
+            if instructions:
+                instructions += " "
+            instructions += f"Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+
         if self.suffix:
             return [
                 replace(
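Reviewer note: combined with the constructor changes earlier in this hunk series, option flags are peeled off the scenario name right to left, "_suffix" first and then "_no_prefix", so both can be stacked. A sketch of the parsing and of the extra sentence it produces (the output_prefix value is an assumed example):

    expander = OutputFormatInstructions("mcqa_no_prefix_suffix")
    assert (expander.scenario, expander.suffix, expander.no_prefix) == ("mcqa", True, True)

    # With no_prefix set and an adapter whose output_prefix is "Answer: ",
    # the instructions string gains: "Do not include 'Answer:' in your answer."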
@@ -1506,6 +1550,31 @@ class OutputFormatInstructions(RunExpander):
         ]
 
 
+class ProcessOutputRunExpander(RunExpander):
+    name = "process_output"
+
+    def __init__(self, processor: str):
+        self.processor = processor
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        output_processing_metric_spec = MetricSpec(
+            class_name="helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
+            args={
+                "processor": self.processor,
+                "metric_specs": [
+                    {"class_name": metric_spec.class_name, "args": metric_spec.args}
+                    for metric_spec in run_spec.metric_specs
+                ],
+            },
+        )
+        return [
+            replace(
+                run_spec,
+                metric_specs=[output_processing_metric_spec],
+            ),
+        ]
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
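Reviewer note: ProcessOutputRunExpander adds no metric of its own; it wraps the run's existing metric specs inside a single OutputProcessingMetric spec, serializing each original spec to a plain dict. A rough before/after sketch (the wrapped metric class name is an assumed example):

    # Before: run_spec.metric_specs == [MetricSpec(class_name="some.module.ExactMatchMetric", args={})]
    # After ProcessOutputRunExpander(processor="my_processor").expand(run_spec),
    # the single remaining spec is:
    # MetricSpec(
    #     class_name="helm.benchmark.metrics.output_processing_metric.OutputProcessingMetric",
    #     args={
    #         "processor": "my_processor",
    #         "metric_specs": [{"class_name": "some.module.ExactMatchMetric", "args": {}}],
    #     },
    # )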
@@ -1532,6 +1601,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     TemperatureRunExpander,
     IncreaseTemperatureRunExpander,
     IncreaseMaxTokensRunExpander,
+    ProcessOutputRunExpander,
 ]
 
 
@@ -37,6 +37,7 @@ from helm.benchmark.run_expander import (
     IncreaseTemperatureRunExpander,
     IncreaseMaxTokensRunExpander,
     LlavaRunExpander,
+    ModelRunExpander,
     OpenFlamingoRunExpander,
     StopRunExpander,
 )
@@ -61,6 +62,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS]  # type: ignore
     args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
 
+    # If no model run expander was specified, add the model=all run expander
+    if not any([expander for expander in expanders if isinstance(expander, ModelRunExpander)]):
+        expanders.append(ModelRunExpander("all"))
+
     run_specs: List[RunSpec] = [run_spec_function(**args)]
 
     # Apply expanders
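Reviewer note: a conf entry that names no model= expander now falls back to expanding over all models, which is what makes the new bare model= check in main() meaningful: such entries only proceed when --models-to-run narrows the expansion. Condensed sketch of the fallback:

    if not any(isinstance(expander, ModelRunExpander) for expander in expanders):
        expanders.append(ModelRunExpander("all"))  # same effect as writing model=all in the entry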
@@ -138,6 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
+    if model.name == "openai/o1-2024-12-17":
+        # From https://platform.openai.com/docs/guides/reasoning,
+        # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        # prompts require, you can adjust this buffer accordingly."
+        run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
         if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
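Reviewer note: the quoted OpenAI guidance motivates the 25,000-token bump. Assuming IncreaseMaxTokensRunExpander adds its value to the adapter's max_tokens, which its name and the value=1 use just above suggest, the intended effect looks like:

    # Suppose run_spec.adapter_spec.max_tokens == 1_000 before expansion.
    run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
    # run_spec.adapter_spec.max_tokens == 26_000, reserving headroom for reasoning tokens.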
@@ -1,3 +1,5 @@
+from typing import Dict, Optional
+
 from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.metric import MetricSpec
@@ -6,7 +8,10 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
 @run_spec_function("air_bench_2024")
-def get_air_bench_2024_spec() -> RunSpec:
+def get_air_bench_2024_spec(
+    annotator_model: Optional[str] = None, annotator_model_deployment: Optional[str] = None
+) -> RunSpec:
+    run_spec_name = "air_bench_2024"
     adapter_spec = AdapterSpec(
         method=ADAPT_GENERATION,
         global_prefix="",
@@ -24,14 +29,27 @@ def get_air_bench_2024_spec() -> RunSpec:
         stop_sequences=[],
     )
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    annotator_args: Dict[str, str] = {}
+    if annotator_model:
+        annotator_args["model"] = annotator_model
+        annotator_args["model_deployment"] = annotator_model_deployment or annotator_model
+        run_spec_name = (
+            "air_bench_2024:"
+            f"annotator_model={annotator_args['model']},"
+            f"annotator_model_deployment={annotator_args['model_deployment']}"
+        )
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator", args=annotator_args
+        )
+    ]
     metric_specs = [
         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
         MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
     ]
     return RunSpec(
-        name="air_bench_2024",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,