PyPI - crfm-helm - Versions diffs - 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (546)

helm/benchmark/run_expander.py (CHANGED)

@@ -3,7 +3,7 @@ from abc import ABC, abstractmethod
 from dataclasses import replace
 from typing import Any, List, Dict, Optional, Tuple, Type
-from helm.proxy.models import (
+from helm.benchmark.model_metadata_registry import (
     get_all_instruction_following_models,
     get_all_code_models,
     get_all_models,
@@ -11,20 +11,17 @@ from helm.proxy.models import (
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
-    GPT2_TOKENIZER_TAG,
-    AI21_TOKENIZER_TAG,
-    COHERE_TOKENIZER_TAG,
-    OPT_TOKENIZER_TAG,
-    GPTJ_TOKENIZER_TAG,
-    GPTNEO_TOKENIZER_TAG,
-    GPT4_TOKENIZER_TAG,
     ABLATION_MODEL_TAG,
+    TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
 )
-from .runner import RunSpec
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
+from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
+from .run_spec import RunSpec
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
 from .augmentations.perturbation import PerturbationSpec
 from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
 class RunExpander(ABC):
@@ -227,12 +224,16 @@ class AddToStopRunExpander(RunExpander):
         self.value = value
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "newline":
+            stop_sequence = "\n"
+        else:
+            stop_sequence = self.value
         return [
             replace(
                 run_spec,
                 name=run_spec.name,
                 adapter_spec=replace(
-                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [self.value]
+                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
                 ),
             ),
         ]
@@ -261,6 +262,238 @@ class GlobalPrefixRunExpander(RunExpander):
         ]
+# Instruction-following models like GPT-4, Claude, PaLM 2 don't do in-context
+# learning naturally like base models, and they prefer to respond in a wordy
+# way as an assistant.  Therefore, for these models, we must provide explicit
+# instructions to follow the format of the in-context examples.
+IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX = (
+    "Here are some input-output examples. "
+    + "Read the examples carefully to figure out the mapping. "
+    + "The output of the last example is not given, "
+    + "and your job is to figure out what it is."
+)
+IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
+    "Please provide the output to this last example. " + "It is critical to follow the format of the preceding outputs!"
+)
+class AnthropicClaude2RunExpander(RunExpander):
+    """
+    Custom prompt for Anthropic Claude 1 and Claude 2 models.
+    These models need more explicit instructions about following the format.
+    """
+    name = "anthropic"
+    # These strings must be added to the prompt in order to pass prompt validation,
+    # otherwise the Anthropic API will return an error.
+    # See: https://docs.anthropic.com/claude/reference/prompt-validation
+    HUMAN_PROMPT = "\n\nHuman:"
+    AI_PROMPT = "\n\nAssistant:"
+    def __init__(self):
+        pass
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=AnthropicClaude2RunExpander.HUMAN_PROMPT
+                    + " "
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX
+                    + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + AnthropicClaude2RunExpander.AI_PROMPT
+                    + " "
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+class AnthropicClaude3RunExpander(RunExpander):
+    """Custom prompts for Anthropic Claude 3 models."""
+    name = "claude_3"
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        return [run_spec]
+class OpenAIRunExpander(RunExpander):
+    """
+    Custom prompt for OpenAI models.
+    These models need more explicit instructions about following the format.
+    """
+    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
+    name = "openai"
+    def __init__(self):
+        pass
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+class GoogleRunExpander(RunExpander):
+    """
+    Custom prompt for Google models.
+    These models need more explicit instructions about following the format.
+    """
+    # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
+    name = "google"
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+class MistralRunExpander(RunExpander):
+    """Custom prompt for Mistral models."""
+    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
+    name = "output_format_instructions"
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+class IDEFICSInstructRunExpander(RunExpander):
+    """
+    Custom prompt for IDEFICS instruct models which require a specific format.
+    See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
+    """
+    name = "idefics_instruct"
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="User: ",
+                    input_suffix="<end_of_utterance>",
+                    output_prefix="\nAssistant: ",
+                    output_suffix="<end_of_utterance>",
+                    stop_sequences=["<end_of_utterance>"],
+                ),
+            ),
+        ]
+class LlavaRunExpander(RunExpander):
+    """
+    Custom prompt for Llava 1.5 models which should use a specific format.
+    See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
+    """
+    name = "llava"
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="USER: <image>",
+                    input_suffix="",
+                    output_prefix="\nASSISTANT: ",
+                    output_suffix="",
+                ),
+            ),
+        ]
+class OpenFlamingoRunExpander(RunExpander):
+    """
+    Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    """
+    name = "open_flamingo"
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
+                ),
+            ),
+        ]
 class FormatPromptRunExpander(RunExpander):
     """Adds a prefix and suffix to the prompt."""
@@ -277,7 +510,7 @@ class FormatPromptRunExpander(RunExpander):
                 name=run_spec.name,
                 adapter_spec=replace(
                     run_spec.adapter_spec,
-                    global_prefix=self.prefix,
+                    input_prefix=self.prefix,
                     output_prefix=self.suffix,
                 ),
             ),
@@ -306,6 +539,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
+        "heim_human_eval": [0, 1, 2, 4, 8],
     }
@@ -313,7 +547,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
     """For overriding the number of eval instances at the run level."""
     name = "max_eval_instances"
-    values_dict: Dict[str, List[Any]] = {}
+    values_dict: Dict[str, List[Any]] = {
+        "default": [1_000],
+        "heim_default": [100],
+        "heim_fid": [30_000],
+        "heim_art_styles": [17],
+    }
 class NumOutputsRunExpander(ReplaceValueRunExpander):
@@ -326,6 +565,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
     }
+class NumTrialRunExpander(ReplaceValueRunExpander):
+    """For getting different generations for the same requests."""
+    name = "num_trials"
+    values_dict = {
+        "heim_efficiency": [5],
+    }
 class ModelRunExpander(ReplaceValueRunExpander):
     """
     For specifying different models.
@@ -355,10 +603,6 @@ class ModelRunExpander(ReplaceValueRunExpander):
             "code": get_all_code_models(),
             "instruction_following": get_all_instruction_following_models(),
             "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-            "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-            "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-            "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-            "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
             "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
             "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
             "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
@@ -371,6 +615,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
                 "openai/text-davinci-003",
             ],
             "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+            "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
             "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
         }
@@ -388,6 +633,40 @@ class ModelRunExpander(ReplaceValueRunExpander):
         return values_dict
+class ModelDeploymentRunExpander(ReplaceValueRunExpander):
+    """For overriding model deployment"""
+    name = "model_deployment"
+    values_dict: Dict[str, List[Any]] = {}
+class EvalSplitRunExpander(RunExpander):
+    """Sets the evaluation split.
+    By default, evaluation instances are drawn from both test and validation splits.
+    This run expander allows drawing evaluation instances from only the test split or
+    only the validation split."""
+    # NOTE: This does not subclass `ReplaceValueRunExpander` because we want the
+    # run expander name to be "eval_split", not "eval_splits".
+    name = "eval_split"
+    def __init__(self, value):
+        if value != TEST_SPLIT and value != VALID_SPLIT:
+            raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
+        self.split = value
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=f"{run_spec.name}{',' if ':' in run_spec.name else ':'}eval_split={self.split}",
+                adapter_spec=replace(run_spec.adapter_spec, eval_splits=[self.split]),
+            )
+        ]
 ############################################################
@@ -576,6 +855,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
     )
+def translate(language_code: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation",
+        args={"language_code": language_code},
+    )
+def suffix(text: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
+        args={"suffix": text},
+    )
 # Specifies the data augmentations that we're interested in trying out.
 # Concretely, this is a mapping from the name (which is specified in a conf
 # file or the CLI) to a list of options to try, where each option is a list of perturbations.
@@ -767,6 +1060,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
             mandarin_to_cantonese(),
         ]
     },
+    # Multilinguality
+    "chinese": {"chinese": [translate(language_code="zh-CN")]},
+    "hindi": {"hindi": [translate(language_code="hi")]},
+    "spanish": {"spanish": [translate(language_code="es")]},
+    # Styles
+    "art": {
+        "art": [
+            suffix("oil painting"),
+            suffix("watercolor"),
+            suffix("pencil sketch"),
+            suffix("animation"),
+            suffix("vector graphics"),
+            suffix("pixel art"),
+        ]
+    },
 }
@@ -880,18 +1188,18 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
         "huggingface/santacoder": ["bigcode/santacoder"],
         "huggingface/starcoder": ["bigcode/starcoder"],
     }
-    model_tags_and_tokenizers = [
-        (GPT2_TOKENIZER_TAG, "huggingface/gpt2"),
-        (AI21_TOKENIZER_TAG, "ai21/j1"),
-        (COHERE_TOKENIZER_TAG, "cohere/cohere"),
-        (OPT_TOKENIZER_TAG, "meta/opt"),
-        (GPTJ_TOKENIZER_TAG, "eleutherai/gptj"),
-        (GPT4_TOKENIZER_TAG, "openai/cl100k_base"),
-        (GPTNEO_TOKENIZER_TAG, "eleutherai/gptneox"),
+    list_tokenizers = [
+        "huggingface/gpt2",
+        "ai21/j1",
+        "cohere/cohere",
+        "meta/opt",
+        "eleutherai/gptj",
+        "openai/cl100k_base",
+        "eleutherai/gptneox",
     ]
-    for model_tag, tokenizer in model_tags_and_tokenizers:
-        for model in get_model_names_with_tag(model_tag):
-            model_to_tokenizer_mapping[model] = [tokenizer]
+    for tokenizer_name in list_tokenizers:
+        for model in get_model_names_with_tokenizer(tokenizer_name):
+            model_to_tokenizer_mapping[model] = [tokenizer_name]
     # tokenizer=default will map to using the right tokenizer for a given model.
     values_dict = {"default": model_to_tokenizer_mapping}
@@ -907,10 +1215,10 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
             self.all_values = [value]
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        # Find right tokenizer given model.
+        # Find right tokenizer given model deployment name.
         if isinstance(self.all_values, dict):
-            model: str = run_spec.adapter_spec.model
-            self.values = self.all_values[model] if model in self.all_values else []
+            deployment: str = run_spec.adapter_spec.model_deployment
+            self.values = self.all_values[deployment] if deployment in self.all_values else []
         else:
             self.values = self.all_values
         return super().expand(run_spec)
@@ -1113,12 +1421,15 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     MaxTrainInstancesRunExpander,
     MaxEvalInstancesRunExpander,
     NumOutputsRunExpander,
+    NumTrialRunExpander,
     ModelRunExpander,
+    ModelDeploymentRunExpander,
     DataAugmentationRunExpander,
     TokenizerRunExpander,
     NumPromptTokensRunExpander,
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
+    EvalSplitRunExpander,
 ]

helm/benchmark/run_spec.py (ADDED)

@@ -0,0 +1,93 @@
+from dataclasses import dataclass, field
+import importlib
+import os
+import pkgutil
+from typing import Callable, Dict, Iterable, List, Optional, TypeVar
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+@dataclass(frozen=True)
+class RunSpec:
+    """
+    Specifies how to do a single run, which gets a scenario, adapts it, and
+    computes a list of stats based on the defined metrics.
+    """
+    name: str
+    """Unique identifier of the RunSpec"""
+    scenario_spec: ScenarioSpec
+    """Which scenario"""
+    adapter_spec: AdapterSpec
+    """Specifies how to adapt an instance into a set of requests"""
+    metric_specs: List[MetricSpec]
+    """What to evaluate on"""
+    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
+    """Data augmenter. The default `DataAugmenterSpec` does nothing."""
+    groups: List[str] = field(default_factory=list)
+    """Groups that this run spec belongs to (for aggregation)"""
+    annotators: Optional[List[AnnotatorSpec]] = None
+    """Annotators to use for this run spec"""
+    def __post_init__(self):
+        """
+        `self.name` is used as the name of the output folder for the `RunSpec`.
+        Clean up `self.name` by replacing any "/"'s with "_".
+        """
+        # TODO: Don't mutate name! clean this up before passing it into the constructor here
+        object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
+RunSpecFunction = Callable[..., RunSpec]
+_REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = {}
+"""Dict of run spec function names to run spec functions."""
+F = TypeVar("F", bound=RunSpecFunction)
+def run_spec_function(name: str) -> Callable[[F], F]:
+    """Register the run spec function under the given name."""
+    def wrap(func: F) -> F:
+        if name in _REGISTERED_RUN_SPEC_FUNCTIONS:
+            raise ValueError(f"A run spec function with name {name} already exists")
+        _REGISTERED_RUN_SPEC_FUNCTIONS[name] = func
+        return func
+    return wrap
+# Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
+def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
+    # Specifying the second argument (prefix) to iter_modules makes the
+    # returned name an absolute name instead of a relative one. This allows
+    # import_module to work without having to do additional modification to
+    # the name.
+    return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
+def discover_run_spec_functions() -> None:
+    """Discover and register all run spec functions under helm.benchmark.run_specs"""
+    import helm.benchmark.run_specs  # noqa
+    for finder, name, ispkg in _iter_namespace(helm.benchmark.run_specs):
+        importlib.import_module(name)
+def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
+    """Return the run spec function registered under the given name."""
+    discover_run_spec_functions()
+    return _REGISTERED_RUN_SPEC_FUNCTIONS.get(name)

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl