crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/adaptation/adapter_spec.py

@@ -39,90 +39,91 @@ class AdapterSpec:
     Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
     """

-    # Method of adaptation
     method: str = ""
+    """The high-level strategy for converting instances into a prompt for the language model."""

-    # Prepend all prompts with this string.
-    # For example, it is recommended to prefix all prompts with [NLG] for UL2.
     global_prefix: str = ""
+    """The string that is prepended to the entire prompt."""

-    # Append all prompts with this string.
     global_suffix: str = ""
+    """The string that is appended to the entire prompt."""

-    # Prompt starts with instructions
     instructions: str = ""
+    """The description of the task that is included at the very beginning of the prompt."""

-    # What goes before the input
     input_prefix: str = "Input: "
+    """The string that is included before each input (e.g., 'Question:')."""

-    # What goes after the input
     input_suffix: str = "\n"
+    """The string that is included after each input (e.g., '\\n')."""

-    # What goes before the input (for multiple choice)
     reference_prefix: str = "A. "
+    """The string that is included before each reference (for multiple-choice questions)."""

-    # What goes before the input (for multiple choice)
     reference_suffix: str = "\n"
+    """The string that is included after each reference (for multiple-choice questions)."""

-    # What goes before the output
     output_prefix: str = "Output: "
+    """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

-    # What goes after the output
     output_suffix: str = "\n"
+    """The string that is included after the correct answer/predicted output (e.g., '\\n')."""

-    # What goes between instruction and in-context example blocks in the constructed prompt
     instance_prefix: str = "\n"
+    """The string that is included before each instance (e.g., '\\n\\n')."""

-    # List of regular expression substitutions that we perform
     substitutions: List[Substitution] = field(default_factory=list, hash=False)
+    """A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
+    to perform at the very end on the prompt."""

-    # Maximum number of (in-context) training instances to put into the prompt
     max_train_instances: int = 5
+    """Maximum number of training instances to include in the prompt (currently by randomly sampling)."""

-    # Maximum number of evaluation instances. For getting valid numbers, this
-    # should be the entire dataset; only reduce this for piloting.
     max_eval_instances: Optional[int] = None
+    """Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""

-    # Generate this many outputs (which could be realized by `num_completions`
-    # or `top_k_per_token`).
     num_outputs: int = 5
+    """Maximum number of possible outputs to generate by sampling multiple outputs."""

-    # Number of trials, where in each trial we choose an independent, random
-    # set of training instances. Used to compute error bars.
     num_train_trials: int = 1
+    """Number of trials, where in each trial we choose an independent, random set of training instances.
+    Used to compute variance."""

-    # Number of trials, where we query the model with the same requests, but different random seeds
     num_trials: int = 1
+    """Number of trials, where we query the model with the same requests, but different random seeds."""

-    # If true, randomly sample N training examples; if false, select N consecutive training examples
     sample_train: bool = True
+    """If true, randomly sample N training examples; if false, select N consecutive training examples"""

     # Decoding parameters (inherited by `Request`)

-    # Model deployment to make the request to (need to fill in)
     model_deployment: str = ""
+    """Name of the language model deployment (<host_organization>/<model name>) to send requests to."""

-    # Model to make the request to
     model: str = ""
+    """Name of the language model (<creator_organization>/<model name>) to send requests to."""

-    # Temperature to use
     temperature: float = 1
+    """Temperature parameter used in generation."""

-    # Maximum number of tokens to generate
     max_tokens: int = 100
+    """Maximum number of tokens to generate."""

-    #
+    # Set hash=False to make `AdapterSpec` hashable
     stop_sequences: List[str] = field(default_factory=list, hash=False)
+    """List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""

     # Random string (used concretely to bypass cache / see diverse results)
     random: Optional[str] = None
+    """Random seed (string), which guarantees reproducibility."""

-    # If true, for instances with multiple correct reference, the gold answer should be considered
-    # to be all the correct references rather than any of the correct references.
     multi_label: bool = False
+    """If true, for instances with multiple correct reference, the gold answer should be considered to be all
+    of the correct references rather than any of the correct references."""

-    # Parameters for image generation
     image_generation_parameters: Optional[ImageGenerationParameters] = None
+    """Parameters for image generation."""

-    #
+    # Set hash=False to make `AdapterSpec` hashable
     eval_splits: Optional[List[str]] = field(default=None, hash=False)
+    """The splits from which evaluation instances will be drawn."""

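For orientation, here is a minimal sketch of how these documented fields are typically filled in when building a run. The field names and defaults come from the diff above; the concrete values (instructions, prefixes, caps) are illustrative only, not code from the package.

from helm.benchmark.adaptation.adapter_spec import AdapterSpec

# Illustrative AdapterSpec for a generation-style task.
# Field names/defaults are from the diff; the specific values here are made up.
spec = AdapterSpec(
    method="generation",                  # high-level adaptation strategy
    instructions="Answer the question.",  # task description at the top of the prompt
    input_prefix="Question: ",            # placed before each input
    output_prefix="Answer: ",             # placed before the (expected) output
    max_train_instances=5,                # in-context examples (sampled when sample_train=True)
    max_eval_instances=100,               # cap on evaluation instances (None = all)
    temperature=0.0,                      # decoding parameters are inherited by `Request`
    max_tokens=100,
    stop_sequences=["\n"],
)
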
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py

@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
         # Prompt
         prompt = MultimodalPrompt(
             global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
             instructions=self.adapter_spec.instructions,
             train_instance_blocks=train_instance_blocks,
             eval_instance_block=eval_instance_block,

helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py

@@ -11,6 +11,9 @@ class MultimodalPrompt:
     # Global prefix, carried over from `AdapterSpec`
     global_prefix: str

+    # Global suffix, carried over from `AdapterSpec`
+    global_suffix: str
+
     # Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
     instance_prefix: str

@@ -47,6 +50,10 @@ class MultimodalPrompt:
         if self.global_prefix:
             result = result.add_textual_prefix(self.global_prefix)

+        # Add the global prefix if one exists
+        if self.global_suffix:
+            result = result.add_textual_suffix(self.global_suffix)
+
         return result

     @property

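Taken together with the `AdapterSpec.global_suffix` change above, the flattened prompt is now wrapped on both sides. Below is a rough sketch of the composition implied by the diff; it is not the package's rendering code, and the real `content` property may differ in detail.

# Rough sketch: how the pieces of a MultimodalPrompt compose once flattened to text.
# `blocks` stands in for the already-rendered instruction/train/eval instance blocks.
def flatten_prompt(global_prefix: str, global_suffix: str, instance_prefix: str, blocks: list) -> str:
    text = instance_prefix.join(blocks)
    if global_prefix:
        text = global_prefix + text  # mirrors result.add_textual_prefix(...)
    if global_suffix:
        text = text + global_suffix  # mirrors the new result.add_textual_suffix(...)
    return text
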
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py

@@ -32,6 +32,7 @@ class TestMultimodalContent(unittest.TestCase):

         prompt = MultimodalPrompt(
             global_prefix="[START]",
+            global_suffix="",
             instance_prefix="\n",
             instructions="Please answer the following questions about the images.",
             train_instance_blocks=train_instance_blocks,
@@ -67,6 +68,7 @@ class TestMultimodalContent(unittest.TestCase):

         prompt = MultimodalPrompt(
             global_prefix="",
+            global_suffix="",
             instance_prefix="\n",
             instructions="",
             train_instance_blocks=[],

helm/benchmark/annotation/air_bench_annotator.py

@@ -0,0 +1,64 @@
+import datasets
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class AIRBench2024Annotator(Annotator):
+    """The AIRBench 2024 autograder."""
+
+    name = "air_bench_2024"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "stanford-crfm/air-bench-2024", "judge_prompts", split="test", cache_dir=cache_dir
+        )
+        self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        category_id = request_state.instance.references[0].output.text
+        prompt_template = self._category_id_to_judge_prompt[category_id]
+        # Strip to deal with incorrectly formatted input CSV.
+        # TODO: Stop stripping after CSV is fixed.
+        annotator_prompt = prompt_template.replace("{{QUESTION}}", model_input_text).replace(
+            "{{ANSWER}}", model_output_text
+        )
+        annotator_request = Request(
+            model="openai/gpt-4o-2024-05-13",
+            model_deployment="openai/gpt-4o-2024-05-13",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

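This annotator and the other new judge-based annotators in this release (LiveQA, MedicationQA) parse the judge's reply with the same lenient `##short_reasoning` / `##the_score` pattern. A small standalone sketch of that parsing step follows; the regex is taken from the code above, while the sample reply is made up.

import re

# Same lenient pattern as the annotators; tolerates extra whitespace around the labels.
pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

sample_reply = "##short_reasoning: Matches the reference answer. ##the_score: 1"  # hypothetical judge output
match = pattern.search(sample_reply)
assert match is not None
reasoning = match[1].strip()     # "Matches the reference answer."
score = float(match[2].strip())  # 1.0
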
helm/benchmark/annotation/annotator_factory.py

@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Mapping, Optional

+from helm.clients.auto_client import AutoClient
 from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
@@ -46,6 +47,11 @@ class AnnotatorFactory:
                 provider_bindings={
                     "api_key": lambda: provide_api_key(self.credentials, annotator_name),
                     "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+                    "auto_client": lambda: AutoClient(
+                        credentials=self.credentials,
+                        file_storage_path=self.file_storage_path,
+                        cache_backend_config=self.cache_backend_config,
+                    ),
                 },
             )
             annotator = create_object(annotator_spec)

helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py

@@ -18,7 +18,7 @@ class LilypondCompilerAnnotator(ImageCompilerAnnotator):
     """Annotator that compiles the text completions into a music sheet with LilyPond."""

     name: str = "lilypond_compiler"
-    base_path = "
+    base_path = "lilypond-2.24.3/bin"

     def __init__(self, cache_config: CacheConfig, file_storage_path: str):
         super().__init__(cache_config, file_storage_path)

helm/benchmark/annotation/live_qa_annotator.py

@@ -0,0 +1,84 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class LiveQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "live_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_responses = ""
+        for i, reference in enumerate(request_state.instance.references):
+            correct_responses += "Correct Response #" + str(i) + ": " + reference.output.text + "\n"
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_responses>
+{{CORRECT_RESPONSES}}
+</correct_responses>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.7, if the model response contains correct information but is incomplete relative to the correct response(s)
+# 0.3, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>""" # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSES}}", correct_responses)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

helm/benchmark/annotation/medication_qa_annotator.py

@@ -0,0 +1,81 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class MedicationQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "medication_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_response = request_state.instance.references[0].output.text
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_response>
+{{CORRECT_RESPONSE}}
+</correct_response>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.5, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>""" # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSE}}", correct_response)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

helm/benchmark/augmentations/perturbation.py

@@ -48,11 +48,27 @@ class TextPerturbation(Perturbation, ABC):

         description = replace(self.description, seed=seed)

+        perturbed_input: Input
+        if instance.input.multimedia_content:
+            perturbed_media_objects = []
+            for media_object in instance.input.multimedia_content.media_objects:
+                # Apply perturbations to the text data of the multimedia content
+                if media_object.is_type("text") and media_object.text is not None:
+                    perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
+                else:
+                    perturbed_media_objects.append(media_object)
+
+            perturbed_input = Input(
+                multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
+            )
+        else:
+            perturbed_input = Input(text=self.perturb(instance.input.text, rng))
+
         # Don't modify `id` of `Instance` here.
         # All the perturbed Instances generated from a single Instance should have the same ID.
         return replace(
             instance,
-            input=
+            input=perturbed_input,
             references=references,
             perturbation=description,
             contrast_inputs=[instance.input],

helm/benchmark/augmentations/test_perturbation.py

@@ -2,6 +2,7 @@
 from typing import List
 import unittest

+from helm.common.media_object import MediaObject, MultimediaObject
 from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
 from .data_augmenter import DataAugmenter
 from .extra_space_perturbation import ExtraSpacePerturbation
@@ -33,6 +34,35 @@ def test_extra_space_perturbation():
     assert instances[1].references[0].output.text == "some name"


+def test_multimodal_text_perturbation():
+    data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
+    input: Input = Input(
+        multimedia_content=MultimediaObject(
+            [
+                MediaObject(text="Hello what is", content_type="text/plain"),
+                MediaObject(text="your name", content_type="text/plain"),
+            ]
+        )
+    )
+    instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
+    instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
+
+    assert len(instances) == 2
+
+    # Test that the first instance is unperturbed
+    assert instances[0].id == "id0"
+    assert instances[0].perturbation is None
+    media_objects = instances[0].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello what is"
+    assert media_objects[1].text == "your name"
+
+    assert instances[1].id == "id0"
+    assert instances[1].perturbation.name == "extra_space"
+    media_objects = instances[1].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello   what   is"
+    assert media_objects[1].text == "your   name"
+
+
 def test_misspelling_perturbation():
     data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
     instance: Instance = Instance(

helm/benchmark/huggingface_registration.py

@@ -1,5 +1,5 @@
 import os
-from typing import Optional
+from typing import Optional, Dict, Union

 from helm.benchmark.model_deployment_registry import (
     ClientSpec,
@@ -17,14 +17,22 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer


 def register_huggingface_model(
-    helm_model_name: str,
+    helm_model_name: str,
+    pretrained_model_name_or_path: str,
+    revision: Optional[str] = None,
+    openvino: Optional[bool] = False,
 ) -> None:
-    object_spec_args = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    object_spec_args: Dict[str, Union[str, bool]] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
     if revision:
         object_spec_args["revision"] = revision
+    if openvino:
+        object_spec_args["openvino"] = openvino

     # Auto-infer model properties from the tokenizer.
-
+    create_tokenizer_args: Dict[str, str] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    if revision:
+        create_tokenizer_args["revision"] = revision
+    with HuggingFaceTokenizer.create_tokenizer(**create_tokenizer_args) as tokenizer:
         max_sequence_length = tokenizer.model_max_length
         end_of_text_token = tokenizer.eos_token or ""
         prefix_token = tokenizer.bos_token or ""
@@ -71,7 +79,7 @@ def register_huggingface_model(
     register_tokenizer_config(tokenizer_config)


-def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
+def register_huggingface_hub_model_from_flag_value(raw_model_string: str, openvino=False) -> None:
     raw_model_string_parts = raw_model_string.split("@")
     pretrained_model_name_or_path: str
     revision: Optional[str]
@@ -88,10 +96,11 @@ def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
         helm_model_name=raw_model_string,
         pretrained_model_name_or_path=pretrained_model_name_or_path,
         revision=revision,
+        openvino=openvino,
     )


-def register_huggingface_local_model_from_flag_value(path: str) -> None:
+def register_huggingface_local_model_from_flag_value(path: str, openvino=False) -> None:
     if not path:
         raise ValueError("Path to Hugging Face model must be non-empty")
     path_parts = os.path.split(path)
@@ -99,4 +108,5 @@ def register_huggingface_local_model_from_flag_value(path: str) -> None:
     register_huggingface_model(
         helm_model_name=helm_model_name,
         pretrained_model_name_or_path=path,
+        openvino=openvino,
     )

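A minimal usage sketch for the extended registration helpers follows; the repository name and local path are hypothetical, and `openvino=True` is simply forwarded into the client arguments as shown above.

from helm.benchmark.huggingface_registration import (
    register_huggingface_hub_model_from_flag_value,
    register_huggingface_local_model_from_flag_value,
)

# "<repo>@<revision>" is split on "@"; the raw flag value doubles as the HELM model name.
register_huggingface_hub_model_from_flag_value("my-org/my-model@main", openvino=True)  # hypothetical repo

# Local checkpoints are registered from a path; the model name is derived from the path.
register_huggingface_local_model_from_flag_value("/models/my-local-model", openvino=True)  # hypothetical path
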
helm/benchmark/metrics/air_bench_metrics.py

@@ -0,0 +1,56 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
+from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class AIRBench2024BasicGenerationMetric(Metric):
+    """Replacement for BasicGenerationMetric for AIRBench 2024.
+
+    We call compute_request_state_metrics here because we can't use `BasicGenerationMetric`
+    because we abuse "references" to store metadata rather than true metadata."""
+
+    def __init__(self):
+        super().__init__()
+        self.efficiency_metric = EfficiencyMetric()
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)
+
+
+class AIRBench2024ScoreMetric(Metric):
+    """Score metrics for AIRBench 2024."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) > 1
+        category_text = request_state.instance.references[0].output.text
+        category_parts = category_text.split(".")
+        assert len(category_parts) == 3
+        assert request_state.annotations
+        score = request_state.annotations["air_bench_2024"]["score"]
+        return [
+            Stat(MetricName("air_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add(
+                score
+            ),
+        ]

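For each judged instance, `AIRBench2024ScoreMetric` emits one overall stat plus three nested category stats derived from the dot-separated category id stored in the first reference. A small sketch of the naming scheme, using a made-up category id:

# Hypothetical category id, e.g. "1.2.3"; real ids come from the AIR-Bench references.
category_text = "1.2.3"
l1, l2, l3 = category_text.split(".")

metric_names = [
    "air_score",
    f"air_category_{l1}_score",            # "air_category_1_score"
    f"air_category_{l1}_{l2}_score",       # "air_category_1_2_score"
    f"air_category_{l1}_{l2}_{l3}_score",  # "air_category_1_2_3_score"
]
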
helm/benchmark/metrics/efficiency_metrics.py

@@ -91,8 +91,15 @@ class EfficiencyMetric:
         window_service: WindowService = WindowServiceFactory.get_window_service(
             adapter_spec.model_deployment, tokenizer_service
         )
-
-
+
+        prompt: str
+        num_prompt_tokens: int
+        if request_state.request.multimodal_prompt is not None:
+            prompt = request_state.request.multimodal_prompt.text
+            num_prompt_tokens = window_service.get_num_tokens(prompt)
+        else:
+            prompt = request_state.request.prompt
+            num_prompt_tokens = window_service.get_num_tokens(prompt)

         # Total number of tokens in the completion.
         num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])