crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/run_expander.py +35 -63
- helm/benchmark/run_spec_factory.py +11 -10
- helm/benchmark/run_specs/vlm_run_specs.py +294 -38
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +257 -10
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +36 -6
- helm/clients/openai_client.py +2 -3
- helm/clients/together_client.py +93 -2
- helm/clients/vertexai_client.py +59 -50
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +11 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/common/images_utils.py +10 -3
- helm/config/model_deployments.yaml +100 -2
- helm/config/model_metadata.yaml +136 -31
- helm/config/tokenizer_configs.yaml +7 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/vision_language/image_metrics.py
CHANGED
@@ -28,7 +28,7 @@ from helm.benchmark.metrics.vision_language.image_utils import (
     pixel_similarity,
     sift_similarity,
 )
-from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive
+from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color
 
 try:
     from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
@@ -78,7 +78,9 @@ class AnnotatedImageMetrics(Metric):
 
     # Metric names
     COMPILE_METRIC: str = "compilation_success"
-
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM1: str = "block_emd_similarity_white"
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM2: str = "block_emd_similarity_median_color"
+    BLOCK_EARTH_MOVER_SIMILARITY: str = "block_emd_similarity"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
     LPIPS_SIMILARITY: str = "lpips_similarity"
@@ -106,7 +108,12 @@ class AnnotatedImageMetrics(Metric):
         metrics: List[AnnotatedMetric] = [
             AnnotatedMetric(self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
             AnnotatedMetric(self.SIFT_SIMILARITY, sift_similarity, "image_np"),
-
+            # Raw block EMD
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY, self.compute_block_emd_raw, "image_PIL"),
+            # Normalized block EMD against white
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1, self.compute_block_emd_white, "image_PIL"),
+            # Normalized block EMD against median
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2, self.compute_block_emd_median, "image_PIL"),
             AnnotatedMetric(self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
             AnnotatedMetric(self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
             AnnotatedMetric(self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
@@ -407,7 +414,7 @@ class AnnotatedImageMetrics(Metric):
         result = _edit_similarity(completion_tokens, truncated_reference_tokens)
         return result
 
-    def
+    def compute_block_emd_white(
         self,
         pred_image: Image.Image,
         ref_image: Image.Image,
@@ -417,17 +424,23 @@ class AnnotatedImageMetrics(Metric):
         weight_most_frequent_color: float = 0.001,
         use_tqdm: bool = False,
     ):
-
-
-
-
-            patch_size,
-            max_num_patches,
-            weight_most_frequent_color,
-            use_tqdm,
-        )
+        """Computes the block Earth Moving Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels. The score is normalized against EMD against white images.
+        """
 
-        def
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        def compute_denominator():
             constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
             value = compute_emd_recursive(
                 constant_image,
@@ -443,8 +456,120 @@ class AnnotatedImageMetrics(Metric):
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
         }
-
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1}", **hash_dict}
+
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+
+    def compute_block_emd_median(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Same as compute_emd_similarity_recursive EXCEPT that
+        the normalization is against an image of the median color.
+        """
+
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        def compute_denominator():
+            ref_img_np = np.array(ref_image)
+            (rgb_most_frequent_color, _) = get_most_frequent_color(ref_img_np)
+
+            # Most frequent color as base
+            constant_image = Image.new("RGB", ref_image.size, tuple(rgb_most_frequent_color))  # type: ignore
+            value = compute_emd_recursive(
+                constant_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+            return {"value": value}
+
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2}", **hash_dict}
+
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+
+    def compute_block_emd_raw(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        def compute():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
         assert self._cache is not None
-
+        emd_raw, _ = self._cache.get(cache_key, compute)
+
+        return emd_raw["value"]
 
-
+    def compute_block_emd_raw_wrapper(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Computes the block Earth Moving Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels. The score is normalized against EMD against white images.
+        """
+        emd_value = compute_emd_recursive(
+            pred_image,
+            ref_image,
+            threshold_most_frequent_color,
+            patch_size,
+            max_num_patches,
+            weight_most_frequent_color,
+            use_tqdm,
+        )
+        return {"value": emd_value}
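For orientation, the three new block-EMD metrics above all reduce to the same normalization scheme: score = 1 - EMD(pred, ref) / EMD(baseline, ref), where the baseline is either an all-white image or an image filled with the reference's most frequent color. The following is an illustrative sketch of that scheme, not HELM's implementation: toy_block_distance stands in for compute_emd_recursive, and toy_most_frequent_color stands in for get_most_frequent_color.

# Hedged sketch of the normalization used by compute_block_emd_white /
# compute_block_emd_median; the distance function here is a deliberately
# simple per-patch mean absolute difference, not a real EMD.
from typing import Tuple

import numpy as np
from PIL import Image


def toy_block_distance(a: Image.Image, b: Image.Image, patch_size: Tuple[int, int] = (8, 8)) -> float:
    # Stand-in for compute_emd_recursive: average per-patch mean absolute difference.
    a_np = np.asarray(a.convert("RGB"), dtype=np.float32)
    b_np = np.asarray(b.convert("RGB"), dtype=np.float32)
    ph, pw = patch_size
    height = (min(a_np.shape[0], b_np.shape[0]) // ph) * ph
    width = (min(a_np.shape[1], b_np.shape[1]) // pw) * pw
    diffs = [
        float(np.abs(a_np[y : y + ph, x : x + pw] - b_np[y : y + ph, x : x + pw]).mean())
        for y in range(0, height, ph)
        for x in range(0, width, pw)
    ]
    return float(np.mean(diffs)) if diffs else 0.0


def toy_most_frequent_color(image: Image.Image) -> Tuple[int, int, int]:
    # Stand-in for get_most_frequent_color: the most common RGB value in the image.
    pixels = np.asarray(image.convert("RGB")).reshape(-1, 3)
    colors, counts = np.unique(pixels, axis=0, return_counts=True)
    r, g, b = colors[counts.argmax()]
    return int(r), int(g), int(b)


def normalized_block_similarity(pred: Image.Image, ref: Image.Image, baseline: str = "white") -> float:
    # score = 1 - dist(pred, ref) / dist(constant_baseline, ref)
    if baseline == "white":
        constant = Image.new("RGB", ref.size, (255, 255, 255))
    else:  # "median_color": normalize against the reference's most frequent color
        constant = Image.new("RGB", ref.size, toy_most_frequent_color(ref))
    numerator = toy_block_distance(pred, ref)
    denominator = toy_block_distance(constant, ref) or 1.0  # avoid division by zero
    return 1.0 - numerator / denominator

A score near 1.0 means the prediction is much closer to the reference than a constant canvas would be; a score at or below 0.0 means it is no better than the baseline.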
helm/benchmark/model_metadata_registry.py
CHANGED
@@ -32,6 +32,7 @@ ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"
 
 GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
 GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
+GOOGLE_GEMINI_PRO_VISION_V1_TAG: str = "GOOGLE_GEMINI_PRO_VISION_V1_TAG"
 GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"
 
 # Models which emit garbage tokens when temperature=0.
@@ -159,7 +160,10 @@ def register_model_metadata(model_metadata: ModelMetadata) -> None:
 def get_model_metadata(model_name: str) -> ModelMetadata:
     """Return the `ModelMetadata` for the model name."""
     if model_name not in MODEL_NAME_TO_MODEL_METADATA:
-        raise ValueError(
+        raise ValueError(
+            f"No model metadata for model name: {model_name} - "
+            "did you remember to add this model to model_metadata.yaml?"
+        )
 
     return MODEL_NAME_TO_MODEL_METADATA[model_name]
 
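The error-message change in get_model_metadata is small but useful when adding new models: a missing entry now points at model_metadata.yaml. A hypothetical call with an unregistered model name (the name below is made up) would behave roughly like this:

# Hypothetical usage; "example-lab/not-a-registered-model" is a made-up name.
from helm.benchmark.model_metadata_registry import get_model_metadata

try:
    get_model_metadata("example-lab/not-a-registered-model")
except ValueError as e:
    print(e)
    # No model metadata for model name: example-lab/not-a-registered-model -
    # did you remember to add this model to model_metadata.yaml?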
helm/benchmark/run_expander.py
CHANGED
@@ -8,12 +8,14 @@ from helm.benchmark.model_metadata_registry import (
     get_all_code_models,
     get_all_models,
     get_all_text_models,
+    get_model_metadata,
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
@@ -322,6 +324,16 @@ class AnthropicClaude3RunExpander(RunExpander):
     name = "claude_3"
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        # Remove all stop sequences that do not contain non-whitespace characters.
+        # This prevents the Anthropic API from returning the following error:
+        # "stop_sequences: each stop sequence must contain non-whitespace"
+        stop_sequences_with_non_whitespace = [
+            stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
+        ]
+        run_spec = replace(
+            run_spec,
+            adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
+        )
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
             instructions = "Answer with only a single letter."
             if run_spec.adapter_spec.instructions:
@@ -335,78 +347,37 @@ class AnthropicClaude3RunExpander(RunExpander):
         return [run_spec]
 
 
-class
-    """
-    Custom prompt for OpenAI models.
-    These models need more explicit instructions about following the format.
-    """
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
-
-    name = "openai"
+class FollowFormatInstructionsRunExpander(RunExpander):
+    """Adds more explicit instructions about following the format to prompts.
 
-
-
+    The argument controls which models will receive these instructions.
+    If "all", all models receive these instructions.
+    If "instruct", only instruction-following models receive these instructions.
 
-
-
-            return [run_spec]
-
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
+    Only supports the generation adaptation method. Raises an error if used on
+    a RunSpec that uses a different adaptation method.
 
-
-
-
-
-    These models need more explicit instructions about following the format.
+    Note: For legacy backwards compatibility reasons, despite the use of the word
+    "instructions" in this run expander's name, this run expander actually
+    modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+    the instructions.
     """
 
-
+    name = "follow_format_instructions"
 
-
+    def __init__(self, value: str):
+        if value != "all" and value != "instruct":
+            raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+        self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method != ADAPT_GENERATION:
-
-
-            return [
-                replace(
-                    run_spec,
-                    name=run_spec.name,
-                    adapter_spec=replace(
-                        run_spec.adapter_spec,
-                        global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                        global_suffix="\n\n"
-                        + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                        + "\n"
-                        + run_spec.adapter_spec.output_prefix.strip(),
-                    ),
-                ),
-            ]
+            raise Exception("follow_format_instructions run expander only supports the generation adaptation method")
 
-
-
-
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
-
-    name = "output_format_instructions"
-
-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+        if (
+            self.value == "instruct"
+            and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+        ):
             return [run_spec]
 
         return [
@@ -539,7 +510,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
-        "
+        "vhelm": [0, 1, 2, 4, 8],
     }
 
 
@@ -1415,6 +1386,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
    NewlineRunExpander,
    StopRunExpander,
    FormatPromptRunExpander,
+   FollowFormatInstructionsRunExpander,
    AddToStopRunExpander,
    GlobalPrefixRunExpander,
    NumTrainTrialsRunExpander,
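Taken together, the run_expander.py hunks replace the old OpenAI-specific prompt expander with a generic FollowFormatInstructionsRunExpander gated on a model tag. The sketch below is a simplified, hypothetical model of that behavior (the ToyAdapterSpec fields, prefix/suffix strings, and tag lookup are placeholders, not HELM's real values):

# Hypothetical simplification of FollowFormatInstructionsRunExpander:
# only generation runs are supported, value="instruct" skips models that are
# not instruction-following, and the change lands in global_prefix/global_suffix.
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class ToyAdapterSpec:  # placeholder for HELM's AdapterSpec
    method: str = "generation"
    model: str = "example-lab/instruct-model"
    output_prefix: str = "Answer: "
    global_prefix: str = ""
    global_suffix: str = ""


FORMAT_PREFIX = "Respond only in the requested format."  # placeholder instruction text
FORMAT_SUFFIX = "Do not include any extra commentary."  # placeholder instruction text
INSTRUCTION_FOLLOWING_MODELS = {"example-lab/instruct-model"}  # placeholder for the tag lookup


def follow_format_instructions(spec: ToyAdapterSpec, value: str = "instruct") -> ToyAdapterSpec:
    if value not in ("all", "instruct"):
        raise ValueError("value must be 'all' or 'instruct'")
    if spec.method != "generation":
        raise Exception("only the generation adaptation method is supported")
    if value == "instruct" and spec.model not in INSTRUCTION_FOLLOWING_MODELS:
        return spec  # leave non-instruction-following models untouched
    return replace(
        spec,
        global_prefix=FORMAT_PREFIX + "\n\n",
        global_suffix="\n\n" + FORMAT_SUFFIX + "\n" + spec.output_prefix.strip(),
    )


print(follow_format_instructions(ToyAdapterSpec()).global_prefix)

The value="instruct" path relies on INSTRUCTION_FOLLOWING_MODEL_TAG from model_metadata_registry, which is why that import was added at the top of run_expander.py.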
helm/benchmark/run_spec_factory.py
CHANGED
@@ -4,7 +4,6 @@ from typing import List
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_GENERATION,
     ADAPT_MULTIPLE_CHOICE_JOINT,
-    ADAPT_GENERATION_MULTIMODAL,
 )
 from helm.benchmark.model_deployment_registry import (
     ModelDeployment,
@@ -14,22 +13,24 @@ from helm.benchmark.model_deployment_registry import (
 from helm.benchmark.model_metadata_registry import (
     ANTHROPIC_CLAUDE_1_MODEL_TAG,
     ANTHROPIC_CLAUDE_2_MODEL_TAG,
+    ANTHROPIC_CLAUDE_3_MODEL_TAG,
     BUGGY_TEMP_0_TAG,
     CHATML_MODEL_TAG,
-
+    GOOGLE_GEMINI_PRO_VISION_V1_TAG,
     IDEFICS_INSTRUCT_MODEL_TAG,
-    IDEFICS_MODEL_TAG,
     LLAVA_MODEL_TAG,
     OPEN_FLAMINGO_MODEL_TAG,
-    VISION_LANGUAGE_MODEL_TAG,
     NLG_PREFIX_TAG,
     NO_NEWLINES_TAG,
+    VISION_LANGUAGE_MODEL_TAG,
+    IDEFICS_MODEL_TAG,
     ModelMetadata,
     get_model_metadata,
 )
 from helm.benchmark.run_expander import (
     RUN_EXPANDERS,
     AnthropicClaude2RunExpander,
+    AnthropicClaude3RunExpander,
     ChatMLRunExpander,
     GlobalPrefixRunExpander,
     IDEFICSInstructRunExpander,
@@ -125,20 +126,20 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
         run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))
 
-    #
+    # Anthropic Claude 3
+    if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
+        run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
+
+    # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
     if (
         VISION_LANGUAGE_MODEL_TAG in model.tags
-        and
+        and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
         and run_spec.adapter_spec.max_tokens == 1
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
-        # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
-        if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
-            run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
-
         if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
             run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
 
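The run_spec_factory.py changes follow the same pattern throughout: inspect the model's metadata tags and conditionally rewrite the RunSpec with the matching expander. A hypothetical distillation of that dispatch (dict-based stand-ins for RunSpec and the expanders; the tag strings mirror the constants in the diff):

# Hypothetical distillation of the tag-driven dispatch in construct_run_specs.
from typing import Set


def clamp_stop_sequences(run_spec: dict) -> dict:
    # Mirrors AnthropicClaude3RunExpander: drop whitespace-only stop sequences.
    stops = run_spec.get("stop_sequences", [])
    return {**run_spec, "stop_sequences": [s for s in stops if s.strip()]}


def increase_max_tokens(run_spec: dict, value: int) -> dict:
    # Mirrors IncreaseMaxTokensRunExpander(value=...).
    return {**run_spec, "max_tokens": run_spec.get("max_tokens", 0) + value}


def alter_run_spec(run_spec: dict, model_tags: Set[str]) -> dict:
    if "ANTHROPIC_CLAUDE_3_MODEL_TAG" in model_tags:
        run_spec = clamp_stop_sequences(run_spec)
    # Gemini Pro Vision v1 returns an empty completion or errors when max_tokens is 1.
    if (
        "VISION_LANGUAGE_MODEL_TAG" in model_tags
        and "GOOGLE_GEMINI_PRO_VISION_V1_TAG" in model_tags
        and run_spec.get("max_tokens") == 1
    ):
        run_spec = increase_max_tokens(run_spec, 1)
    return run_spec


print(alter_run_spec({"stop_sequences": ["\n", "###"], "max_tokens": 1},
                     {"ANTHROPIC_CLAUDE_3_MODEL_TAG"}))

Note also that the blanket max_tokens bump for IDEFICS open-ended generation was removed, along with the now-unused ADAPT_GENERATION_MULTIMODAL import.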