crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -8,19 +8,22 @@ from helm.benchmark.model_metadata_registry import (
8
8
  get_all_code_models,
9
9
  get_all_models,
10
10
  get_all_text_models,
11
+ get_model_metadata,
11
12
  get_model_names_with_tag,
12
13
  FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
13
14
  LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
14
15
  ABLATION_MODEL_TAG,
16
+ TEXT_TO_IMAGE_MODEL_TAG,
15
17
  VISION_LANGUAGE_MODEL_TAG,
18
+ INSTRUCTION_FOLLOWING_MODEL_TAG,
16
19
  )
17
20
  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
18
- from helm.common.general import handle_module_not_found_error
19
21
  from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
20
- from .runner import RunSpec
21
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
22
+ from .run_spec import RunSpec
23
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
22
24
  from .augmentations.perturbation import PerturbationSpec
23
25
  from .augmentations.data_augmenter import DataAugmenterSpec
26
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
24
27
 
25
28
 
26
29
  class RunExpander(ABC):
@@ -223,12 +226,16 @@ class AddToStopRunExpander(RunExpander):
223
226
  self.value = value
224
227
 
225
228
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
229
+ if self.value == "newline":
230
+ stop_sequence = "\n"
231
+ else:
232
+ stop_sequence = self.value
226
233
  return [
227
234
  replace(
228
235
  run_spec,
229
236
  name=run_spec.name,
230
237
  adapter_spec=replace(
231
- run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [self.value]
238
+ run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
232
239
  ),
233
240
  ),
234
241
  ]
@@ -273,33 +280,37 @@ IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
273
280
  )
274
281
 
275
282
 
276
- class AnthropicRunExpander(RunExpander):
283
+ class AnthropicClaude2RunExpander(RunExpander):
277
284
  """
278
- Custom prompt for Anthropic models.
285
+ Custom prompt for Anthropic Claude 1 and Claude 2 models.
279
286
  These models need more explicit instructions about following the format.
280
287
  """
281
288
 
282
289
  name = "anthropic"
283
290
 
291
+ # These strings must be added to the prompt in order to pass prompt validation,
292
+ # otherwise the Anthropic API will return an error.
293
+ # See: https://docs.anthropic.com/claude/reference/prompt-validation
294
+ HUMAN_PROMPT = "\n\nHuman:"
295
+ AI_PROMPT = "\n\nAssistant:"
296
+
284
297
  def __init__(self):
285
298
  pass
286
299
 
287
300
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
288
- try:
289
- import anthropic
290
- except ModuleNotFoundError as e:
291
- handle_module_not_found_error(e, ["anthropic"])
292
-
293
301
  return [
294
302
  replace(
295
303
  run_spec,
296
304
  name=run_spec.name,
297
305
  adapter_spec=replace(
298
306
  run_spec.adapter_spec,
299
- global_prefix=anthropic.HUMAN_PROMPT + " " + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
307
+ global_prefix=AnthropicClaude2RunExpander.HUMAN_PROMPT
308
+ + " "
309
+ + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX
310
+ + "\n\n",
300
311
  global_suffix="\n\n"
301
312
  + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
302
- + anthropic.AI_PROMPT
313
+ + AnthropicClaude2RunExpander.AI_PROMPT
303
314
  + " "
304
315
  + run_spec.adapter_spec.output_prefix.strip(),
305
316
  ),
@@ -307,21 +318,66 @@ class AnthropicRunExpander(RunExpander):
307
318
  ]
308
319
 
309
320
 
310
- class OpenAIRunExpander(RunExpander):
311
- """
312
- Custom prompt for OpenAI models.
313
- These models need more explicit instructions about following the format.
314
- """
321
+ class AnthropicClaude3RunExpander(RunExpander):
322
+ """Custom prompts for Anthropic Claude 3 models."""
315
323
 
316
- # TODO: Refactor out common logic between this and GoogleRunExpander.
324
+ name = "claude_3"
317
325
 
318
- name = "openai"
326
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
327
+ # Remove all stop sequences that do not contain non-whitespace characters.
328
+ # This prevents the Anthropic API from returning the following error:
329
+ # "stop_sequences: each stop sequence must contain non-whitespace"
330
+ stop_sequences_with_non_whitespace = [
331
+ stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
332
+ ]
333
+ run_spec = replace(
334
+ run_spec,
335
+ adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
336
+ )
337
+ if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
338
+ instructions = "Answer with only a single letter."
339
+ if run_spec.adapter_spec.instructions:
340
+ instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
341
+ return [
342
+ replace(
343
+ run_spec,
344
+ adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
345
+ ),
346
+ ]
347
+ return [run_spec]
319
348
 
320
- def __init__(self):
321
- pass
349
+
350
+ class FollowFormatInstructionsRunExpander(RunExpander):
351
+ """Adds more explicit instructions about following the format to prompts.
352
+
353
+ The argument controls which models will receive these instructions.
354
+ If "all", all models receive these instructions.
355
+ If "instruct", only instruction-following models receive these instructions.
356
+
357
+ Only supports the generation adaptation method. Raises an error if used on
358
+ a RunSpec that uses a different adaptation method.
359
+
360
+ Note: For legacy backwards compatibility reasons, despite the use of the word
361
+ "instructions" in this run expander's name, this run expander actually
362
+ modifies the global_prefix and the global_suffix of the AdapterSpec rather than
363
+ the instructions.
364
+ """
365
+
366
+ name = "follow_format_instructions"
367
+
368
+ def __init__(self, value: str):
369
+ if value != "all" and value != "instruct":
370
+ raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
371
+ self.value = value
322
372
 
323
373
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
324
374
  if run_spec.adapter_spec.method != ADAPT_GENERATION:
375
+ raise Exception("follow_format_instructions run expander only supports the generation adaptation method")
376
+
377
+ if (
378
+ self.value == "instruct"
379
+ and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
380
+ ):
325
381
  return [run_spec]
326
382
 
327
383
  return [
@@ -340,31 +396,70 @@ class OpenAIRunExpander(RunExpander):
340
396
  ]
341
397
 
342
398
 
343
- class GoogleRunExpander(RunExpander):
399
+ class IDEFICSInstructRunExpander(RunExpander):
344
400
  """
345
- Custom prompt for Google models.
346
- These models need more explicit instructions about following the format.
401
+ Custom prompt for IDEFICS instruct models which require a specific format.
402
+ See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
347
403
  """
348
404
 
349
- # TODO: Refactor out common logic between this and OpenAIRunExpander.
405
+ name = "idefics_instruct"
406
+
407
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
408
+ return [
409
+ replace(
410
+ run_spec,
411
+ name=run_spec.name,
412
+ adapter_spec=replace(
413
+ run_spec.adapter_spec,
414
+ input_prefix="User: ",
415
+ input_suffix="<end_of_utterance>",
416
+ output_prefix="\nAssistant: ",
417
+ output_suffix="<end_of_utterance>",
418
+ stop_sequences=["<end_of_utterance>"],
419
+ ),
420
+ ),
421
+ ]
350
422
 
351
- name = "google"
423
+
424
+ class LlavaRunExpander(RunExpander):
425
+ """
426
+ Custom prompt for Llava 1.5 models which should use a specific format.
427
+ See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
428
+ """
429
+
430
+ name = "llava"
352
431
 
353
432
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
354
- if run_spec.adapter_spec.method != ADAPT_GENERATION:
355
- return [run_spec]
433
+ return [
434
+ replace(
435
+ run_spec,
436
+ name=run_spec.name,
437
+ adapter_spec=replace(
438
+ run_spec.adapter_spec,
439
+ input_prefix="USER: <image>",
440
+ input_suffix="",
441
+ output_prefix="\nASSISTANT: ",
442
+ output_suffix="",
443
+ ),
444
+ ),
445
+ ]
446
+
447
+
448
+ class OpenFlamingoRunExpander(RunExpander):
449
+ """
450
+ Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
451
+ """
452
+
453
+ name = "open_flamingo"
356
454
 
455
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
357
456
  return [
358
457
  replace(
359
458
  run_spec,
360
459
  name=run_spec.name,
361
460
  adapter_spec=replace(
362
461
  run_spec.adapter_spec,
363
- global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
364
- global_suffix="\n\n"
365
- + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
366
- + "\n"
367
- + run_spec.adapter_spec.output_prefix.strip(),
462
+ input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
368
463
  ),
369
464
  ),
370
465
  ]
@@ -415,6 +510,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
415
510
  "one": [1],
416
511
  "all": [0, 1, 2, 4, 8, 16], # Cap at 16 due to limited context length
417
512
  "big_bench_few_shot_setting": [0, 1, 2, 3], # Commonly used few-shot setting in BIG-bench
513
+ "vhelm": [0, 1, 2, 4, 8],
418
514
  }
419
515
 
420
516
 
@@ -422,7 +518,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
422
518
  """For overriding the number of eval instances at the run level."""
423
519
 
424
520
  name = "max_eval_instances"
425
- values_dict: Dict[str, List[Any]] = {}
521
+ values_dict: Dict[str, List[Any]] = {
522
+ "default": [1_000],
523
+ "heim_default": [100],
524
+ "heim_fid": [30_000],
525
+ "heim_art_styles": [17],
526
+ }
426
527
 
427
528
 
428
529
  class NumOutputsRunExpander(ReplaceValueRunExpander):
@@ -435,6 +536,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
435
536
  }
436
537
 
437
538
 
539
+ class NumTrialRunExpander(ReplaceValueRunExpander):
540
+ """For getting different generations for the same requests."""
541
+
542
+ name = "num_trials"
543
+ values_dict = {
544
+ "heim_efficiency": [5],
545
+ }
546
+
547
+
438
548
  class ModelRunExpander(ReplaceValueRunExpander):
439
549
  """
440
550
  For specifying different models.
@@ -476,6 +586,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
476
586
  "openai/text-davinci-003",
477
587
  ],
478
588
  "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
589
+ "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
479
590
  "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
480
591
  }
481
592
 
@@ -500,6 +611,33 @@ class ModelDeploymentRunExpander(ReplaceValueRunExpander):
500
611
  values_dict: Dict[str, List[Any]] = {}
501
612
 
502
613
 
614
+ class EvalSplitRunExpander(RunExpander):
615
+ """Sets the evaluation split.
616
+
617
+ By default, evaluation instances are drawn from both test and validation splits.
618
+ This run expander allows drawing evaluation instances from only the test split or
619
+ only the validation split."""
620
+
621
+ # NOTE: This does not subclass `ReplaceValueRunExpander` because we want the
622
+ # run expander name to be "eval_split", not "eval_splits".
623
+
624
+ name = "eval_split"
625
+
626
+ def __init__(self, value):
627
+ if value != TEST_SPLIT and value != VALID_SPLIT:
628
+ raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
629
+ self.split = value
630
+
631
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
632
+ return [
633
+ replace(
634
+ run_spec,
635
+ name=f"{run_spec.name}{',' if ':' in run_spec.name else ':'}eval_split={self.split}",
636
+ adapter_spec=replace(run_spec.adapter_spec, eval_splits=[self.split]),
637
+ )
638
+ ]
639
+
640
+
503
641
  ############################################################
504
642
 
505
643
 
@@ -688,6 +826,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
688
826
  )
689
827
 
690
828
 
829
+ def translate(language_code: str) -> PerturbationSpec:
830
+ return PerturbationSpec(
831
+ class_name="helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation",
832
+ args={"language_code": language_code},
833
+ )
834
+
835
+
836
+ def suffix(text: str) -> PerturbationSpec:
837
+ return PerturbationSpec(
838
+ class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
839
+ args={"suffix": text},
840
+ )
841
+
842
+
691
843
  # Specifies the data augmentations that we're interested in trying out.
692
844
  # Concretely, this is a mapping from the name (which is specified in a conf
693
845
  # file or the CLI) to a list of options to try, where each option is a list of perturbations.
@@ -879,6 +1031,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
879
1031
  mandarin_to_cantonese(),
880
1032
  ]
881
1033
  },
1034
+ # Multilinguality
1035
+ "chinese": {"chinese": [translate(language_code="zh-CN")]},
1036
+ "hindi": {"hindi": [translate(language_code="hi")]},
1037
+ "spanish": {"spanish": [translate(language_code="es")]},
1038
+ # Styles
1039
+ "art": {
1040
+ "art": [
1041
+ suffix("oil painting"),
1042
+ suffix("watercolor"),
1043
+ suffix("pencil sketch"),
1044
+ suffix("animation"),
1045
+ suffix("vector graphics"),
1046
+ suffix("pixel art"),
1047
+ ]
1048
+ },
882
1049
  }
883
1050
 
884
1051
 
@@ -1219,12 +1386,14 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
1219
1386
  NewlineRunExpander,
1220
1387
  StopRunExpander,
1221
1388
  FormatPromptRunExpander,
1389
+ FollowFormatInstructionsRunExpander,
1222
1390
  AddToStopRunExpander,
1223
1391
  GlobalPrefixRunExpander,
1224
1392
  NumTrainTrialsRunExpander,
1225
1393
  MaxTrainInstancesRunExpander,
1226
1394
  MaxEvalInstancesRunExpander,
1227
1395
  NumOutputsRunExpander,
1396
+ NumTrialRunExpander,
1228
1397
  ModelRunExpander,
1229
1398
  ModelDeploymentRunExpander,
1230
1399
  DataAugmentationRunExpander,
@@ -1232,6 +1401,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
1232
1401
  NumPromptTokensRunExpander,
1233
1402
  NumOutputTokensRunExpander,
1234
1403
  ChatMLRunExpander,
1404
+ EvalSplitRunExpander,
1235
1405
  ]
1236
1406
 
1237
1407
 
@@ -0,0 +1,93 @@
1
+ from dataclasses import dataclass, field
2
+ import importlib
3
+ import os
4
+ import pkgutil
5
+ from typing import Callable, Dict, Iterable, List, Optional, TypeVar
6
+
7
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
+ from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
9
+ from helm.benchmark.metrics.metric import MetricSpec
10
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
11
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class RunSpec:
16
+ """
17
+ Specifies how to do a single run, which gets a scenario, adapts it, and
18
+ computes a list of stats based on the defined metrics.
19
+ """
20
+
21
+ name: str
22
+ """Unique identifier of the RunSpec"""
23
+
24
+ scenario_spec: ScenarioSpec
25
+ """Which scenario"""
26
+
27
+ adapter_spec: AdapterSpec
28
+ """Specifies how to adapt an instance into a set of requests"""
29
+
30
+ metric_specs: List[MetricSpec]
31
+ """What to evaluate on"""
32
+
33
+ data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
34
+ """Data augmenter. The default `DataAugmenterSpec` does nothing."""
35
+
36
+ groups: List[str] = field(default_factory=list)
37
+ """Groups that this run spec belongs to (for aggregation)"""
38
+
39
+ annotators: Optional[List[AnnotatorSpec]] = None
40
+ """Annotators to use for this run spec"""
41
+
42
+ def __post_init__(self):
43
+ """
44
+ `self.name` is used as the name of the output folder for the `RunSpec`.
45
+ Clean up `self.name` by replacing any "/"'s with "_".
46
+ """
47
+ # TODO: Don't mutate name! clean this up before passing it into the constructor here
48
+ object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
49
+
50
+
51
+ RunSpecFunction = Callable[..., RunSpec]
52
+
53
+
54
+ _REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = {}
55
+ """Dict of run spec function names to run spec functions."""
56
+
57
+
58
+ F = TypeVar("F", bound=RunSpecFunction)
59
+
60
+
61
+ def run_spec_function(name: str) -> Callable[[F], F]:
62
+ """Register the run spec function under the given name."""
63
+
64
+ def wrap(func: F) -> F:
65
+ if name in _REGISTERED_RUN_SPEC_FUNCTIONS:
66
+ raise ValueError(f"A run spec function with name {name} already exists")
67
+ _REGISTERED_RUN_SPEC_FUNCTIONS[name] = func
68
+ return func
69
+
70
+ return wrap
71
+
72
+
73
+ # Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
74
+ def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
75
+ # Specifying the second argument (prefix) to iter_modules makes the
76
+ # returned name an absolute name instead of a relative one. This allows
77
+ # import_module to work without having to do additional modification to
78
+ # the name.
79
+ return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
80
+
81
+
82
+ def discover_run_spec_functions() -> None:
83
+ """Discover and register all run spec functions under helm.benchmark.run_specs"""
84
+ import helm.benchmark.run_specs # noqa
85
+
86
+ for finder, name, ispkg in _iter_namespace(helm.benchmark.run_specs):
87
+ importlib.import_module(name)
88
+
89
+
90
+ def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
91
+ """Return the run spec function registered under the given name."""
92
+ discover_run_spec_functions()
93
+ return _REGISTERED_RUN_SPEC_FUNCTIONS.get(name)
@@ -0,0 +1,163 @@
1
+ import dataclasses
2
+ from typing import List
3
+
4
+ from helm.benchmark.adaptation.adapter_spec import (
5
+ ADAPT_GENERATION,
6
+ ADAPT_MULTIPLE_CHOICE_JOINT,
7
+ )
8
+ from helm.benchmark.model_deployment_registry import (
9
+ ModelDeployment,
10
+ get_default_model_deployment_for_model,
11
+ get_model_deployment,
12
+ )
13
+ from helm.benchmark.model_metadata_registry import (
14
+ ANTHROPIC_CLAUDE_1_MODEL_TAG,
15
+ ANTHROPIC_CLAUDE_2_MODEL_TAG,
16
+ ANTHROPIC_CLAUDE_3_MODEL_TAG,
17
+ BUGGY_TEMP_0_TAG,
18
+ CHATML_MODEL_TAG,
19
+ GOOGLE_GEMINI_PRO_VISION_V1_TAG,
20
+ IDEFICS_INSTRUCT_MODEL_TAG,
21
+ LLAVA_MODEL_TAG,
22
+ OPEN_FLAMINGO_MODEL_TAG,
23
+ NLG_PREFIX_TAG,
24
+ NO_NEWLINES_TAG,
25
+ VISION_LANGUAGE_MODEL_TAG,
26
+ IDEFICS_MODEL_TAG,
27
+ ModelMetadata,
28
+ get_model_metadata,
29
+ )
30
+ from helm.benchmark.run_expander import (
31
+ RUN_EXPANDERS,
32
+ AnthropicClaude2RunExpander,
33
+ AnthropicClaude3RunExpander,
34
+ ChatMLRunExpander,
35
+ GlobalPrefixRunExpander,
36
+ IDEFICSInstructRunExpander,
37
+ IncreaseTemperatureRunExpander,
38
+ IncreaseMaxTokensRunExpander,
39
+ LlavaRunExpander,
40
+ OpenFlamingoRunExpander,
41
+ StopRunExpander,
42
+ )
43
+ from helm.benchmark.run_spec import RunSpec, get_run_spec_function
44
+ from helm.common.general import singleton
45
+ from helm.common.object_spec import ObjectSpec
46
+
47
+
48
+ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
49
+ """
50
+ Takes a specification (name, args) and returns a list of `RunSpec`s.
51
+ """
52
+ # Note that we are abusing `spec` a bit because the name is not actually a class name.
53
+ name = spec.class_name
54
+ args = spec.args
55
+
56
+ run_spec_function = get_run_spec_function(name)
57
+ if run_spec_function is None:
58
+ raise ValueError(f"Unknown run spec name: {name}")
59
+
60
+ # Peel off the run expanders (e.g., model)
61
+ expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS] # type: ignore
62
+ args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
63
+
64
+ run_specs: List[RunSpec] = [run_spec_function(**args)]
65
+
66
+ # Apply expanders
67
+ for expander in expanders:
68
+ run_specs = [
69
+ child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
70
+ ]
71
+
72
+ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
73
+ if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
74
+ raise ValueError("At least one of model_deployment and model must be specified")
75
+ elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment:
76
+ # Infer model from model deployment
77
+ default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name
78
+ if not default_model_name:
79
+ default_model_name = run_spec.adapter_spec.model_deployment
80
+ run_spec = dataclasses.replace(
81
+ run_spec,
82
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name),
83
+ )
84
+ elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
85
+ # Infer model deployment from model
86
+ default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model)
87
+ if not default_model_deployment:
88
+ raise ValueError(
89
+ f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}"
90
+ )
91
+ run_spec = dataclasses.replace(
92
+ run_spec,
93
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment),
94
+ )
95
+
96
+ # Both model and model_deployment should now be filled
97
+ assert run_spec.adapter_spec.model_deployment
98
+ assert run_spec.adapter_spec.model
99
+
100
+ model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model)
101
+ deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment)
102
+ if run_spec.adapter_spec.model != deployment.model_name:
103
+ raise ValueError(
104
+ f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'"
105
+ f"for model '{run_spec.adapter_spec.model}' but the model deployment is "
106
+ f"for a different model '{deployment.model_name}'"
107
+ )
108
+ # For models that strip newlines, when we're generating, we need to set
109
+ # the delimiter to be '###' so we stop properly.
110
+ if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
111
+ ADAPT_GENERATION,
112
+ ADAPT_MULTIPLE_CHOICE_JOINT,
113
+ ):
114
+ stop_expander = StopRunExpander(value="hash")
115
+ run_spec = singleton(stop_expander.expand(run_spec))
116
+
117
+ if NLG_PREFIX_TAG in model.tags:
118
+ global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
119
+ run_spec = singleton(global_prefix_expander.expand(run_spec))
120
+
121
+ if CHATML_MODEL_TAG in model.tags:
122
+ chatml_expander = ChatMLRunExpander()
123
+ run_spec = singleton(chatml_expander.expand(run_spec))
124
+
125
+ # Anthropic Claude 1 and 2 prompts
126
+ if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
127
+ run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))
128
+
129
+ # Anthropic Claude 3
130
+ if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
131
+ run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
132
+
133
+ # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
134
+ if (
135
+ VISION_LANGUAGE_MODEL_TAG in model.tags
136
+ and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
137
+ and run_spec.adapter_spec.max_tokens == 1
138
+ ):
139
+ run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
140
+
141
+ # IDEFICS special handling
142
+ if IDEFICS_MODEL_TAG in model.tags:
143
+ if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
144
+ run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
145
+
146
+ # Llava
147
+ if LLAVA_MODEL_TAG in model.tags:
148
+ run_spec = singleton(LlavaRunExpander().expand(run_spec))
149
+
150
+ # OpenFlamingo
151
+ if OPEN_FLAMINGO_MODEL_TAG in model.tags:
152
+ run_spec = singleton(OpenFlamingoRunExpander().expand(run_spec))
153
+
154
+ # For multiple choice
155
+ if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
156
+ increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
157
+ run_spec = singleton(increase_temperature_expander.expand(run_spec))
158
+
159
+ return run_spec
160
+
161
+ run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
162
+
163
+ return run_specs
File without changes