crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/presentation/schema.py CHANGED
@@ -1,6 +1,9 @@
+ import ast
+ import dataclasses
  from dataclasses import dataclass, field
  from typing import List, Optional, Dict
  import dacite
+ from inspect import cleandoc
  import mako.template
  import yaml
  import importlib_resources as resources
@@ -17,6 +20,11 @@ SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
  SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"


+ _ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
+ _ADAPTER_SPEC_FILENAME = "adapter_spec.py"
+ _ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"
+
+
  @dataclass(frozen=True)
  class Field:
      """
@@ -198,9 +206,6 @@ class RunGroup(Field):
  class Schema:
      """Specifies information about what to display on the frontend."""

-     # Adapter fields (e.g., temperature)
-     adapter: List[Field]
-
      # Information about each field
      metrics: List[Field]

@@ -213,6 +218,11 @@ class Schema:
      # Group the scenarios
      run_groups: List[RunGroup]

+     # Adapter fields (e.g., temperature)
+     # Automatically populated from the docstrings in the AdapterSpec class definition.
+     # Should not be specified in the user's YAML file.
+     adapter: Optional[List[Field]] = None
+
      def __post_init__(self):
          self.name_to_metric = {metric.name: metric for metric in self.metrics}
          self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
@@ -220,6 +230,43 @@ class Schema:
          self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}


+ def get_adapter_fields() -> List[Field]:
+     """Generate the adapter fields from the docstrings in the AdapterSpec class definition."""
+     # Unfortunately there is no standard library support for getting docstrings of class fields,
+     # so we have to do the parsing outselves. Fortunately, the parsing is quite straightforward.
+     adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
+     with open(adapter_spec_path, "r") as f:
+         contents = f.read()
+     module_node = ast.parse(contents)
+     adapter_spec_node = [
+         node
+         for node in ast.iter_child_nodes(module_node)
+         if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
+     ][0]
+     metadata_fields: List[Field] = []
+     field_name: str = ""
+     for node in ast.iter_child_nodes(adapter_spec_node):
+         if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
+             # This node is a field definition.
+             # Save the name of the field for later.
+             field_name = node.target.id
+         else:
+             # If this is a docstring that immediately follows a field definition,
+             # output an adapter field with the name set to the field definition and
+             # the description set to the docstring.
+             if (
+                 field_name
+                 and isinstance(node, ast.Expr)
+                 and isinstance(node.value, ast.Constant)
+                 and isinstance(node.value.value, str)
+             ):
+                 description = cleandoc(node.value.value).replace("\n", " ")
+                 metadata_fields.append(Field(name=field_name, description=description))
+             field_name = ""
+
+     return metadata_fields
+
+
  def get_default_schema_path() -> str:
      return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)

@@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
      hlog(f"Reading schema file {schema_path}...")
      with open(schema_path, "r") as f:
          raw = yaml.safe_load(f)
-     return dacite.from_dict(Schema, raw)
+     schema = dacite.from_dict(Schema, raw)
+     if schema.adapter:
+         hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+     return dataclasses.replace(schema, adapter=get_adapter_fields())
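
A note on the technique used in get_adapter_fields() above: Python's standard library exposes class docstrings but not per-field docstrings, so the code walks the AST of adapter_spec.py and pairs each annotated assignment with the string literal that follows it. The snippet below is a self-contained illustration of that pattern; ExampleSpec and its fields are invented for the example and are not part of HELM.

# Self-contained illustration of the AST pattern used above: an annotated assignment
# (ast.AnnAssign) followed by a bare string constant is treated as "field + docstring".
# ExampleSpec is a toy class invented for this sketch.
import ast
from inspect import cleandoc

source = '''
class ExampleSpec:
    temperature: float = 1.0
    """Sampling temperature passed to the model."""

    max_tokens: int = 100
    """Maximum number of tokens to generate."""
'''

class_node = ast.parse(source).body[0]
fields = {}
field_name = ""
for node in ast.iter_child_nodes(class_node):
    if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
        field_name = node.target.id  # remember the field we just saw
    elif field_name and isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
        fields[field_name] = cleandoc(node.value.value)  # the "docstring" for that field
        field_name = ""

print(fields)
# {'temperature': 'Sampling temperature passed to the model.', 'max_tokens': 'Maximum number of tokens to generate.'}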
helm/benchmark/presentation/test_schema.py ADDED
@@ -0,0 +1,11 @@
+ from helm.benchmark.presentation.schema import get_adapter_fields
+
+
+ def test_get_adapter_fields() -> None:
+     adapter_fields = get_adapter_fields()
+     assert adapter_fields
+     assert adapter_fields[0].name == "method"
+     assert (
+         adapter_fields[0].description
+         == "The high-level strategy for converting instances into a prompt for the language model."
+     )
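
With this change, schema YAML files no longer need to declare adapter fields: read_schema() back-fills Schema.adapter from the AdapterSpec docstrings and only warns if a schema file still contains an adapter section. A minimal usage sketch, assuming a HELM install where the modules above resolve as in this diff:

# Minimal usage sketch of the new behavior; relies only on functions shown in the diff above.
from helm.benchmark.presentation.schema import get_default_schema_path, read_schema

schema = read_schema(get_default_schema_path())
# schema.adapter is now populated automatically from AdapterSpec's field docstrings.
print(schema.adapter[0].name)  # "method", matching the new test above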
helm/benchmark/run.py CHANGED
@@ -264,6 +264,13 @@ def main():
          default=None,
          help="Full class name of the Runner class to use. If unset, uses the default Runner.",
      )
+     parser.add_argument(
+         "--openvino",
+         action="store_true",
+         default=False,
+         help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+         "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+     )
      add_run_args(parser)
      args = parser.parse_args()
      validate_args(args)
@@ -275,12 +282,19 @@
          from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

          for huggingface_model_name in args.enable_huggingface_models:
-             register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+             if args.openvino:
+                 register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+             else:
+                 register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
      if args.enable_local_huggingface_models:
          from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

          for huggingface_model_path in args.enable_local_huggingface_models:
-             register_huggingface_local_model_from_flag_value(huggingface_model_path)
+             if args.openvino:
+                 register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+             else:
+                 register_huggingface_local_model_from_flag_value(huggingface_model_path)

      run_entries: List[RunEntry] = []
      if args.conf_paths:
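
For context on the new --openvino flag: it marks Hugging Face AutoModelForCausalLM models for OpenVINO acceleration, which the updated huggingface_client.py would typically obtain through optimum-intel. The snippet below is a rough standalone sketch of that pattern, not HELM's actual client code; the model name is illustrative.

# Rough sketch of OpenVINO-backed causal LM inference via optimum-intel.
# This approximates what the --openvino flag enables; it is not HELM's implementation.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "gpt2"  # illustrative model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = OVModelForCausalLM.from_pretrained(model_id, export=True)  # convert to OpenVINO IR on load

inputs = tokenizer("The quick brown fox", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))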
helm/benchmark/run_expander.py CHANGED
@@ -8,12 +8,14 @@ from helm.benchmark.model_metadata_registry import (
      get_all_code_models,
      get_all_models,
      get_all_text_models,
+     get_model_metadata,
      get_model_names_with_tag,
      FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
      LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
      ABLATION_MODEL_TAG,
      TEXT_TO_IMAGE_MODEL_TAG,
      VISION_LANGUAGE_MODEL_TAG,
+     INSTRUCTION_FOLLOWING_MODEL_TAG,
  )
  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
  from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
@@ -192,6 +194,15 @@ class StopRunExpander(RunExpander):
          self.value = value

      def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+         if self.value == "none":
+             return [
+                 replace(
+                     run_spec,
+                     name=f"{run_spec.name},{self.name}={self.value}",
+                     adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                 ),
+             ]
+
          if self.value == "hash":
              stop = "###"
          elif self.value == "semicolon":
@@ -322,6 +333,16 @@ class AnthropicClaude3RunExpander(RunExpander):
      name = "claude_3"

      def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+         # Remove all stop sequences that do not contain non-whitespace characters.
+         # This prevents the Anthropic API from returnin the following error:
+         # "stop_sequences: each stop sequence must contain non-whitespace"
+         stop_sequences_with_non_whitespace = [
+             stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
+         ]
+         run_spec = replace(
+             run_spec,
+             adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
+         )
          if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
              instructions = "Answer with only a single letter."
              if run_spec.adapter_spec.instructions:
@@ -335,78 +356,37 @@ class AnthropicClaude3RunExpander(RunExpander):
          return [run_spec]


- class OpenAIRunExpander(RunExpander):
-     """
-     Custom prompt for OpenAI models.
-     These models need more explicit instructions about following the format.
-     """
+ class FollowFormatInstructionsRunExpander(RunExpander):
+     """Adds more explicit instructions about following the format to prompts.

-     # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
-
-     name = "openai"
-
-     def __init__(self):
-         pass
-
-     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-         if run_spec.adapter_spec.method != ADAPT_GENERATION:
-             return [run_spec]
-
-         return [
-             replace(
-                 run_spec,
-                 name=run_spec.name,
-                 adapter_spec=replace(
-                     run_spec.adapter_spec,
-                     global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                     global_suffix="\n\n"
-                     + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                     + "\n"
-                     + run_spec.adapter_spec.output_prefix.strip(),
-                 ),
-             ),
-         ]
+     The argument controlls which models will receive these instructions.
+     If "all", all models receive these instructions.
+     If "instruct", only instruction-following models receive these instructions.

+     Only supports the generation adaptation method. Raises an error if used on
+     a RunSpec that uses a different adaptation method.

- class GoogleRunExpander(RunExpander):
-     """
-     Custom prompt for Google models.
-     These models need more explicit instructions about following the format.
+     Note: For legacy backwards compatibility reasons, despite the use of the word
+     "instructions" in this run expander's name, this run expander actually
+     modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+     the instructions.
      """

-     # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
+     name = "follow_format_instructions"

-     name = "google"
+     def __init__(self, value: str):
+         if value != "all" and value != "instruct":
+             raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+         self.value = value

      def expand(self, run_spec: RunSpec) -> List[RunSpec]:
          if run_spec.adapter_spec.method != ADAPT_GENERATION:
-             return [run_spec]
-
-         return [
-             replace(
-                 run_spec,
-                 name=run_spec.name,
-                 adapter_spec=replace(
-                     run_spec.adapter_spec,
-                     global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                     global_suffix="\n\n"
-                     + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                     + "\n"
-                     + run_spec.adapter_spec.output_prefix.strip(),
-                 ),
-             ),
-         ]
-
+             raise Exception("follow_format_instructions run expander only supports the generation adaptation method")

- class MistralRunExpander(RunExpander):
-     """Custom prompt for Mistral models."""
-
-     # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
-
-     name = "output_format_instructions"
-
-     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-         if run_spec.adapter_spec.method != ADAPT_GENERATION:
+         if (
+             self.value == "instruct"
+             and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+         ):
              return [run_spec]

          return [
@@ -539,7 +519,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
          "one": [1],
          "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
          "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
-         "heim_human_eval": [0, 1, 2, 4, 8],
+         "vhelm": [0, 1, 2, 4, 8],
      }


@@ -1064,6 +1044,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
      "chinese": {"chinese": [translate(language_code="zh-CN")]},
      "hindi": {"hindi": [translate(language_code="hi")]},
      "spanish": {"spanish": [translate(language_code="es")]},
+     "swahili": {"swahili": [translate(language_code="sw")]},
      # Styles
      "art": {
          "art": [
@@ -1409,12 +1390,79 @@ class ChatMLRunExpander(RunExpander):
          ]


+ class OutputFormatInstructions(RunExpander):
+     """Add extra instructions to about output formatting to HELM Lite scenarios.
+
+     Many instruction-following models and chat models are tuned to expect conversational prompts
+     and respond in a conversational way. These models occasionally produce outputs that are not
+     in the expected format. This run expander instructs these models to provide the output in
+     the format expected by the scenario.
+
+     The argument should be the name of the scenario."""
+
+     name = "output_format_instructions"
+
+     def __init__(self, scenario: str):
+         self.scenario = scenario
+
+     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+             if self.scenario == "mmlu_only_last_question":
+                 instructions = "Answer only the last question with only a single letter."
+             else:
+                 instructions = "Answer with only a single letter."
+             if run_spec.adapter_spec.instructions:
+                 instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+             return [
+                 replace(
+                     run_spec,
+                     adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                 ),
+             ]
+         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+             output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+             if self.scenario == "narrative_qa":
+                 instructions = (
+                     "Answer with one word, a few-word phrase, or a short sentence. "
+                     + "Avoid extra, unnecessary information in the answer."
+                 )
+             elif self.scenario == "natural_qa":
+                 instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+             elif self.scenario == "legalbench":
+                 if output_noun != "Answer":
+                     instructions = f"Answer with the {output_noun.lower()}."
+                 else:
+                     instructions = "Answer yes or no."
+             elif self.scenario == "wmt_14":
+                 instructions = "Answer with the English translation."
+             else:
+                 raise ValueError(f"Unknown scenario {self.scenario}")
+
+             if run_spec.adapter_spec.output_prefix:
+                 instructions = (
+                     f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+                 )
+
+             if run_spec.adapter_spec.instructions:
+                 instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+             else:
+                 instructions = f"{instructions}\n"
+             return [
+                 replace(
+                     run_spec,
+                     adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                 ),
+             ]
+         raise ValueError(f"Unknown scenario {self.scenario}")
+
+
  RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
      InstructionsRunExpander,
      PromptRunExpander,
      NewlineRunExpander,
      StopRunExpander,
      FormatPromptRunExpander,
+     FollowFormatInstructionsRunExpander,
      AddToStopRunExpander,
      GlobalPrefixRunExpander,
      NumTrainTrialsRunExpander,
@@ -1430,6 +1478,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
      NumOutputTokensRunExpander,
      ChatMLRunExpander,
      EvalSplitRunExpander,
+     OutputFormatInstructions,
  ]


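To see what the new OutputFormatInstructions run expander does to a run, here is a hedged usage sketch. The RunSpec is constructed by hand with illustrative values (the scenario class path and model name are made up for the example); only classes and fields that appear in this diff are assumed.

# Hedged usage sketch of OutputFormatInstructions; values below are illustrative.
from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
from helm.benchmark.run_expander import OutputFormatInstructions
from helm.benchmark.run_spec import RunSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec

run_spec = RunSpec(
    name="narrative_qa,model=openai/gpt-4-0613",
    scenario_spec=ScenarioSpec(class_name="helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario"),
    adapter_spec=AdapterSpec(method=ADAPT_GENERATION, output_prefix="Answer: ", max_tokens=100),
    metric_specs=[],
    groups=["narrative_qa"],
)

# expand() returns a list of RunSpecs whose adapter_spec.instructions now lead with
# the scenario-specific formatting guidance defined in the class above.
(expanded,) = OutputFormatInstructions("narrative_qa").expand(run_spec)
print(expanded.adapter_spec.instructions)
# "Answer with one word, a few-word phrase, or a short sentence. Avoid extra, unnecessary
#  information in the answer. Do not include 'Answer:' in your answer.\n"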
helm/benchmark/run_spec_factory.py CHANGED
@@ -4,7 +4,6 @@ from typing import List
  from helm.benchmark.adaptation.adapter_spec import (
      ADAPT_GENERATION,
      ADAPT_MULTIPLE_CHOICE_JOINT,
-     ADAPT_GENERATION_MULTIMODAL,
  )
  from helm.benchmark.model_deployment_registry import (
      ModelDeployment,
@@ -14,22 +13,24 @@ from helm.benchmark.model_deployment_registry import (
  from helm.benchmark.model_metadata_registry import (
      ANTHROPIC_CLAUDE_1_MODEL_TAG,
      ANTHROPIC_CLAUDE_2_MODEL_TAG,
+     ANTHROPIC_CLAUDE_3_MODEL_TAG,
      BUGGY_TEMP_0_TAG,
      CHATML_MODEL_TAG,
-     GOOGLE_GEMINI_MODEL_TAG,
+     GOOGLE_GEMINI_PRO_VISION_V1_TAG,
      IDEFICS_INSTRUCT_MODEL_TAG,
-     IDEFICS_MODEL_TAG,
      LLAVA_MODEL_TAG,
      OPEN_FLAMINGO_MODEL_TAG,
-     VISION_LANGUAGE_MODEL_TAG,
      NLG_PREFIX_TAG,
      NO_NEWLINES_TAG,
+     VISION_LANGUAGE_MODEL_TAG,
+     IDEFICS_MODEL_TAG,
      ModelMetadata,
      get_model_metadata,
  )
  from helm.benchmark.run_expander import (
      RUN_EXPANDERS,
      AnthropicClaude2RunExpander,
+     AnthropicClaude3RunExpander,
      ChatMLRunExpander,
      GlobalPrefixRunExpander,
      IDEFICSInstructRunExpander,
@@ -125,20 +126,20 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
          if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
              run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))

-         # Google Gemini Vision returns an empty completion or throws an error if max_tokens is 1
+         # Anthropic Claude 3
+         if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
+             run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
+
+         # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
          if (
              VISION_LANGUAGE_MODEL_TAG in model.tags
-             and GOOGLE_GEMINI_MODEL_TAG in model.tags
+             and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
              and run_spec.adapter_spec.max_tokens == 1
          ):
              run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))

          # IDEFICS special handling
          if IDEFICS_MODEL_TAG in model.tags:
-             # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
-             if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
-                 run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
-
              if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
                  run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))

@@ -155,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
              increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
              run_spec = singleton(increase_temperature_expander.expand(run_spec))

+         # MedLM-Large
+         if run_spec.adapter_spec.model == "google/medlm-large":
+             run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
          return run_spec

      run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
helm/benchmark/run_specs/air_bench_run_specs.py ADDED
@@ -0,0 +1,40 @@
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
+ from helm.benchmark.metrics.metric import MetricSpec
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("air_bench_2024")
+ def get_air_bench_2024_spec() -> RunSpec:
+     adapter_spec = AdapterSpec(
+         method=ADAPT_GENERATION,
+         global_prefix="",
+         global_suffix="",
+         instructions="",
+         input_prefix="",
+         input_suffix="",
+         output_prefix="",
+         output_suffix="",
+         instance_prefix="",
+         max_train_instances=0,
+         num_outputs=1,
+         max_tokens=512,
+         temperature=0.0,
+         stop_sequences=[],
+     )
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+     annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+     metric_specs = [
+         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+         MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+     ]
+     return RunSpec(
+         name="air_bench_2024",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         annotators=annotator_specs,
+         groups=["air_bench_2024"],
+     )
helm/benchmark/run_specs/classic_run_specs.py CHANGED
@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
      get_ranking_binary_adapter_spec,
      get_summarization_adapter_spec,
  )
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
  from helm.benchmark.metrics.common_metric_specs import (
      get_basic_metric_specs,
      get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:

  @run_spec_function("live_qa")
  def get_live_qa_spec() -> RunSpec:
-     from helm.common.gpu_utils import get_torch_device_name
-
      scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")

      adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@ def get_live_qa_spec() -> RunSpec:
          max_train_instances=0,
          max_tokens=512,
      )
+     annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+     metric_specs = get_open_ended_generation_metric_specs() + [
+         MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+     ]

      return RunSpec(
          name="live_qa",
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_summarization_metric_specs(
-             {"task": "live_qa", "device": get_torch_device_name()},
-         ),
+         annotators=annotator_specs,
+         metric_specs=metric_specs,
          groups=["live_qa"],
      )


  @run_spec_function("medication_qa")
  def get_medication_qa_spec() -> RunSpec:
-     from helm.common.gpu_utils import get_torch_device_name
-
      scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")

      adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@ def get_medication_qa_spec() -> RunSpec:
          max_tokens=512,
      )

+     annotator_specs = [
+         AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+     ]
+     metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
      return RunSpec(
          name="medication_qa",
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_summarization_metric_specs(
-             {"task": "medication_qa", "device": get_torch_device_name()},
-         ),
+         annotators=annotator_specs,
+         metric_specs=metric_specs,
          groups=["medication_qa"],
      )

@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs(),
-         groups=["thai_exam"],
+         groups=["thai_exam", f"thai_exam_{exam}"],
      )
helm/benchmark/run_specs/decodingtrust_run_specs.py CHANGED
@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
          name="decodingtrust_toxicity_prompts",
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
+         metric_specs=get_generative_harms_metric_specs(
+             include_basic_metrics=True, include_generative_harms_metrics=True
+         ),
          groups=["decodingtrust", "toxicity_prompts"],
      )
helm/benchmark/run_specs/experimental_run_specs.py ADDED
@@ -0,0 +1,33 @@
+ """Run specs for experiments only.
+
+ These run specs are not intended for use with public leaderboards."""
+
+ from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+ from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+ from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("ci_mcqa")
+ def get_ci_mcqa_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+     adapter_spec = get_multiple_choice_adapter_spec(
+         method=ADAPT_MULTIPLE_CHOICE_JOINT,
+         instructions=(
+             "Give a letter answer among the options given. "
+             "For example, if the options are A, B, C, D, E, and F, "
+             "your answer should consist of the single letter that corresponds to the correct answer."
+         ),
+         input_noun="Question",
+         output_noun="Answer",
+     )
+
+     return RunSpec(
+         name="ci_mcqa",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=get_exact_match_metric_specs(),
+         groups=["CIMCQA"],
+     )
helm/benchmark/run_specs/finance_run_specs.py ADDED
@@ -0,0 +1,33 @@
+ """Run spec functions for the HELM Finance leaderboard.
+
+ Website: https://crfm.stanford.edu/helm/finance/"""
+
+ from helm.benchmark.adaptation.common_adapter_specs import (
+     get_generation_adapter_spec,
+ )
+ from helm.benchmark.metrics.common_metric_specs import (
+     get_basic_metric_specs,
+ )
+ from helm.benchmark.metrics.metric import MetricSpec
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("fin_qa")
+ def get_fin_qa_spec() -> RunSpec:
+     from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+     adapter_spec = get_generation_adapter_spec(
+         instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+     )
+     metric_specs = get_basic_metric_specs([]) + [
+         MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+     ]
+     return RunSpec(
+         name="fin_qa",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=["fin_qa"],
+     )