PyPI - crfm-helm - Versions diffs - 0.5.0__tar.gz → 0.5.1__tar.gz - Mend

crfm-helm 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (665) hide show

{crfm_helm-0.5.0/src/crfm_helm.egg-info → crfm_helm-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.5.0
+Version: 0.5.1
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -25,7 +25,7 @@ Requires-Dist: tqdm~=4.64
 Requires-Dist: zstandard~=0.18.0
 Requires-Dist: sqlitedict~=1.7
 Requires-Dist: bottle~=0.12.23
-Requires-Dist: datasets~=2.15
+Requires-Dist: datasets~=2.17
 Requires-Dist: pyarrow>=11.0.0
 Requires-Dist: pyarrow-hotfix~=0.6
 Requires-Dist: nltk~=3.7
@@ -34,7 +34,7 @@ Requires-Dist: rouge-score~=0.1.2
 Requires-Dist: scipy~=1.10
 Requires-Dist: uncertainty-calibration~=0.1.4
 Requires-Dist: scikit-learn~=1.1
-Requires-Dist: transformers~=4.37
+Requires-Dist: transformers~=4.40
 Requires-Dist: torch<3.0.0,>=1.13.1
 Requires-Dist: torchvision<3.0.0,>=0.14.1
 Requires-Dist: google-api-python-client~=2.64
@@ -94,6 +94,8 @@ Requires-Dist: tiktoken~=0.3.3; extra == "openai"
 Requires-Dist: pydantic~=2.0; extra == "openai"
 Provides-Extra: google
 Requires-Dist: google-cloud-aiplatform~=1.44; extra == "google"
+Provides-Extra: together
+Requires-Dist: together~=1.1; extra == "together"
 Provides-Extra: tsinghua
 Requires-Dist: icetk~=0.0.4; extra == "tsinghua"
 Provides-Extra: yandex
@@ -106,6 +108,7 @@ Requires-Dist: crfm-helm[anthropic]; extra == "models"
 Requires-Dist: crfm-helm[google]; extra == "models"
 Requires-Dist: crfm-helm[mistral]; extra == "models"
 Requires-Dist: crfm-helm[openai]; extra == "models"
+Requires-Dist: crfm-helm[together]; extra == "models"
 Requires-Dist: crfm-helm[tsinghua]; extra == "models"
 Requires-Dist: crfm-helm[yandex]; extra == "models"
 Provides-Extra: vlm
@@ -119,6 +122,7 @@ Requires-Dist: scipy~=1.10; extra == "vlm"
 Requires-Dist: torchvision<3.0.0,>=0.14.1; extra == "vlm"
 Requires-Dist: crfm-helm[images]; extra == "vlm"
 Requires-Dist: crfm-helm[image2structure]; extra == "vlm"
+Requires-Dist: pycocoevalcap~=1.2; extra == "vlm"
 Provides-Extra: image2structure
 Requires-Dist: crfm-helm[images]; extra == "image2structure"
 Requires-Dist: latex~=0.7.0; extra == "image2structure"

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = crfm-helm
-version = 0.5.0
+version = 0.5.1
 author = Stanford CRFM
 author_email = contact-crfm@stanford.edu
 description = Benchmark for language models
@@ -35,7 +35,7 @@ install_requires =
 	sqlitedict~=1.7
 	bottle~=0.12.23
-	datasets~=2.15
+	datasets~=2.17
 	pyarrow>=11.0.0  # Pinned transitive dependency for datasets; workaround for #1026
 	pyarrow-hotfix~=0.6  # Hotfix for CVE-2023-47248
@@ -46,7 +46,7 @@ install_requires =
 	uncertainty-calibration~=0.1.4
 	scikit-learn~=1.1
-	transformers~=4.37  # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
+	transformers~=4.40  # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
 	torch>=1.13.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
 	torchvision>=0.14.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
@@ -109,6 +109,8 @@ openai =
 	pydantic~=2.0  # For model_dump(mode="json") - openai only requires pydantic>=1.9.0
 google =
 	google-cloud-aiplatform~=1.44
+together =
+	together~=1.1
 tsinghua =
 	icetk~=0.0.4
 yandex =
@@ -121,6 +123,7 @@ models =
 	crfm-helm[google]
 	crfm-helm[mistral]
 	crfm-helm[openai]
+	crfm-helm[together]
 	crfm-helm[tsinghua]
 	crfm-helm[yandex]
 vlm =
@@ -138,6 +141,8 @@ vlm =
 	crfm-helm[images]
 	crfm-helm[image2structure]
+	pycocoevalcap~=1.2
 image2structure =
 	crfm-helm[images]

{crfm_helm-0.5.0 → crfm_helm-0.5.1/src/crfm_helm.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.5.0
+Version: 0.5.1
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -25,7 +25,7 @@ Requires-Dist: tqdm~=4.64
 Requires-Dist: zstandard~=0.18.0
 Requires-Dist: sqlitedict~=1.7
 Requires-Dist: bottle~=0.12.23
-Requires-Dist: datasets~=2.15
+Requires-Dist: datasets~=2.17
 Requires-Dist: pyarrow>=11.0.0
 Requires-Dist: pyarrow-hotfix~=0.6
 Requires-Dist: nltk~=3.7
@@ -34,7 +34,7 @@ Requires-Dist: rouge-score~=0.1.2
 Requires-Dist: scipy~=1.10
 Requires-Dist: uncertainty-calibration~=0.1.4
 Requires-Dist: scikit-learn~=1.1
-Requires-Dist: transformers~=4.37
+Requires-Dist: transformers~=4.40
 Requires-Dist: torch<3.0.0,>=1.13.1
 Requires-Dist: torchvision<3.0.0,>=0.14.1
 Requires-Dist: google-api-python-client~=2.64
@@ -94,6 +94,8 @@ Requires-Dist: tiktoken~=0.3.3; extra == "openai"
 Requires-Dist: pydantic~=2.0; extra == "openai"
 Provides-Extra: google
 Requires-Dist: google-cloud-aiplatform~=1.44; extra == "google"
+Provides-Extra: together
+Requires-Dist: together~=1.1; extra == "together"
 Provides-Extra: tsinghua
 Requires-Dist: icetk~=0.0.4; extra == "tsinghua"
 Provides-Extra: yandex
@@ -106,6 +108,7 @@ Requires-Dist: crfm-helm[anthropic]; extra == "models"
 Requires-Dist: crfm-helm[google]; extra == "models"
 Requires-Dist: crfm-helm[mistral]; extra == "models"
 Requires-Dist: crfm-helm[openai]; extra == "models"
+Requires-Dist: crfm-helm[together]; extra == "models"
 Requires-Dist: crfm-helm[tsinghua]; extra == "models"
 Requires-Dist: crfm-helm[yandex]; extra == "models"
 Provides-Extra: vlm
@@ -119,6 +122,7 @@ Requires-Dist: scipy~=1.10; extra == "vlm"
 Requires-Dist: torchvision<3.0.0,>=0.14.1; extra == "vlm"
 Requires-Dist: crfm-helm[images]; extra == "vlm"
 Requires-Dist: crfm-helm[image2structure]; extra == "vlm"
+Requires-Dist: pycocoevalcap~=1.2; extra == "vlm"
 Provides-Extra: image2structure
 Requires-Dist: crfm-helm[images]; extra == "image2structure"
 Requires-Dist: latex~=0.7.0; extra == "image2structure"

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/crfm_helm.egg-info/SOURCES.txt RENAMED Viewed

@@ -32,7 +32,6 @@ src/helm/benchmark/server.py
 src/helm/benchmark/slurm_jobs.py
 src/helm/benchmark/slurm_runner.py
 src/helm/benchmark/test_data_preprocessor.py
-src/helm/benchmark/test_model_deployment_definition.py
 src/helm/benchmark/test_run_expander.py
 src/helm/benchmark/tokenizer_config_registry.py
 src/helm/benchmark/adaptation/__init__.py
@@ -327,13 +326,23 @@ src/helm/benchmark/scenarios/image_generation/relational_understanding_scenario.
 src/helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py
 src/helm/benchmark/scenarios/image_generation/winoground_scenario.py
 src/helm/benchmark/scenarios/vision_language/__init__.py
+src/helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py
 src/helm/benchmark/scenarios/vision_language/bingo_scenario.py
+src/helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py
+src/helm/benchmark/scenarios/vision_language/flickr30k_scenario.py
+src/helm/benchmark/scenarios/vision_language/gqa_scenario.py
 src/helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py
 src/helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py
+src/helm/benchmark/scenarios/vision_language/math_vista_scenario.py
 src/helm/benchmark/scenarios/vision_language/mementos_scenario.py
+src/helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
 src/helm/benchmark/scenarios/vision_language/mme_scenario.py
 src/helm/benchmark/scenarios/vision_language/mmmu_scenario.py
+src/helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py
+src/helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py
 src/helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py
+src/helm/benchmark/scenarios/vision_language/originality_scenario.py
+src/helm/benchmark/scenarios/vision_language/pairs_scenario.py
 src/helm/benchmark/scenarios/vision_language/pope_scenario.py
 src/helm/benchmark/scenarios/vision_language/seed_bench_scenario.py
 src/helm/benchmark/scenarios/vision_language/unicorn_scenario.py
@@ -360,10 +369,12 @@ src/helm/benchmark/static/info-icon.png
 src/helm/benchmark/static/json-urls.js
 src/helm/benchmark/static/plot-captions.js
 src/helm/benchmark/static/schema_classic.yaml
+src/helm/benchmark/static/schema_image2structure.yaml
 src/helm/benchmark/static/schema_instruction_following.yaml
 src/helm/benchmark/static/schema_lite.yaml
 src/helm/benchmark/static/schema_mmlu.yaml
 src/helm/benchmark/static/schema_unitxt.yaml
+src/helm/benchmark/static/schema_vhelm_lite.yaml
 src/helm/benchmark/static/schema_vlm.yaml
 src/helm/benchmark/static/utils.js
 src/helm/benchmark/static/images/crfm-logo.png
@@ -399,8 +410,8 @@ src/helm/benchmark/static_build/assets/google-06d997ad.png
 src/helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png
 src/helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png
 src/helm/benchmark/static_build/assets/helmhero-28e90f4d.png
-src/helm/benchmark/static_build/assets/index-5088afcb.css
-src/helm/benchmark/static_build/assets/index-d839df55.js
+src/helm/benchmark/static_build/assets/index-737eef9e.js
+src/helm/benchmark/static_build/assets/index-878a1094.css
 src/helm/benchmark/static_build/assets/meta-5580e9f1.png
 src/helm/benchmark/static_build/assets/microsoft-f5ee5016.png
 src/helm/benchmark/static_build/assets/mistral-18e1be23.png
@@ -545,6 +556,7 @@ src/helm/clients/image_generation/mindalle/utils/config.py
 src/helm/clients/image_generation/mindalle/utils/sampling.py
 src/helm/clients/image_generation/mindalle/utils/utils.py
 src/helm/clients/vision_language/__init__.py
+src/helm/clients/vision_language/huggingface_vision2seq_client.py
 src/helm/clients/vision_language/huggingface_vlm_client.py
 src/helm/clients/vision_language/idefics_client.py
 src/helm/clients/vision_language/open_flamingo_client.py

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/crfm_helm.egg-info/requires.txt RENAMED Viewed

@@ -10,7 +10,7 @@ tqdm~=4.64
 zstandard~=0.18.0
 sqlitedict~=1.7
 bottle~=0.12.23
-datasets~=2.15
+datasets~=2.17
 pyarrow>=11.0.0
 pyarrow-hotfix~=0.6
 nltk~=3.7
@@ -19,7 +19,7 @@ rouge-score~=0.1.2
 scipy~=1.10
 uncertainty-calibration~=0.1.4
 scikit-learn~=1.1
-transformers~=4.37
+transformers~=4.40
 torch<3.0.0,>=1.13.1
 torchvision<3.0.0,>=0.14.1
 google-api-python-client~=2.64
@@ -137,6 +137,7 @@ crfm-helm[anthropic]
 crfm-helm[google]
 crfm-helm[mistral]
 crfm-helm[openai]
+crfm-helm[together]
 crfm-helm[tsinghua]
 crfm-helm[yandex]
@@ -167,6 +168,9 @@ simple-slurm~=0.2.6
 [summarization]
 summ-eval~=0.892
+[together]
+together~=1.1
 [tsinghua]
 icetk~=0.0.4
@@ -184,6 +188,7 @@ scipy~=1.10
 torchvision<3.0.0,>=0.14.1
 crfm-helm[images]
 crfm-helm[image2structure]
+pycocoevalcap~=1.2
 [yandex]
 sentencepiece~=0.1.97

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py RENAMED Viewed

@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
         # Prompt
         prompt = MultimodalPrompt(
             global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
             instructions=self.adapter_spec.instructions,
             train_instance_blocks=train_instance_blocks,
             eval_instance_block=eval_instance_block,

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py RENAMED Viewed

@@ -11,6 +11,9 @@ class MultimodalPrompt:
     # Global prefix, carried over from `AdapterSpec`
     global_prefix: str
+    # Global suffix, carried over from `AdapterSpec`
+    global_suffix: str
     # Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
     instance_prefix: str
@@ -47,6 +50,10 @@ class MultimodalPrompt:
         if self.global_prefix:
             result = result.add_textual_prefix(self.global_prefix)
+        # Add the global suffix if one exists
+        if self.global_suffix:
+            result = result.add_textual_suffix(self.global_suffix)
         return result
     @property

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py RENAMED Viewed

@@ -32,6 +32,7 @@ class TestMultimodalContent(unittest.TestCase):
         prompt = MultimodalPrompt(
             global_prefix="[START]",
+            global_suffix="",
             instance_prefix="\n",
             instructions="Please answer the following questions about the images.",
             train_instance_blocks=train_instance_blocks,
@@ -67,6 +68,7 @@ class TestMultimodalContent(unittest.TestCase):
         prompt = MultimodalPrompt(
             global_prefix="",
+            global_suffix="",
             instance_prefix="\n",
             instructions="",
             train_instance_blocks=[],

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py RENAMED Viewed

@@ -18,7 +18,7 @@ class LilypondCompilerAnnotator(ImageCompilerAnnotator):
     """Annotator that compiles the text completions into a music sheet with LilyPond."""
     name: str = "lilypond_compiler"
-    base_path = "/home/josselin/installs/lilypond-2.24.3/bin"
+    base_path = "lilypond-2.24.3/bin"
     def __init__(self, cache_config: CacheConfig, file_storage_path: str):
         super().__init__(cache_config, file_storage_path)

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/augmentations/perturbation.py RENAMED Viewed

@@ -48,11 +48,27 @@ class TextPerturbation(Perturbation, ABC):
         description = replace(self.description, seed=seed)
+        perturbed_input: Input
+        if instance.input.multimedia_content:
+            perturbed_media_objects = []
+            for media_object in instance.input.multimedia_content.media_objects:
+                # Apply perturbations to the text data of the multimedia content
+                if media_object.is_type("text") and media_object.text is not None:
+                    perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
+                else:
+                    perturbed_media_objects.append(media_object)
+            perturbed_input = Input(
+                multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
+            )
+        else:
+            perturbed_input = Input(text=self.perturb(instance.input.text, rng))
         # Don't modify `id` of `Instance` here.
         # All the perturbed Instances generated from a single Instance should have the same ID.
         return replace(
             instance,
-            input=Input(text=self.perturb(instance.input.text, rng)),
+            input=perturbed_input,
             references=references,
             perturbation=description,
             contrast_inputs=[instance.input],

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/augmentations/test_perturbation.py RENAMED Viewed

@@ -2,6 +2,7 @@
 from typing import List
 import unittest
+from helm.common.media_object import MediaObject, MultimediaObject
 from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
 from .data_augmenter import DataAugmenter
 from .extra_space_perturbation import ExtraSpacePerturbation
@@ -33,6 +34,35 @@ def test_extra_space_perturbation():
     assert instances[1].references[0].output.text == "some name"
+def test_multimodal_text_perturbation():
+    data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
+    input: Input = Input(
+        multimedia_content=MultimediaObject(
+            [
+                MediaObject(text="Hello what is", content_type="text/plain"),
+                MediaObject(text="your name", content_type="text/plain"),
+            ]
+        )
+    )
+    instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
+    instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
+    assert len(instances) == 2
+    # Test that the first instance is unperturbed
+    assert instances[0].id == "id0"
+    assert instances[0].perturbation is None
+    media_objects = instances[0].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello what is"
+    assert media_objects[1].text == "your name"
+    assert instances[1].id == "id0"
+    assert instances[1].perturbation.name == "extra_space"
+    media_objects = instances[1].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello   what   is"
+    assert media_objects[1].text == "your   name"
 def test_misspelling_perturbation():
     data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
     instance: Instance = Instance(

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/metrics/efficiency_metrics.py RENAMED Viewed

@@ -91,8 +91,15 @@ class EfficiencyMetric:
         window_service: WindowService = WindowServiceFactory.get_window_service(
             adapter_spec.model_deployment, tokenizer_service
         )
-        prompt: str = request_state.request.prompt
-        num_prompt_tokens: int = window_service.get_num_tokens(prompt)
+        prompt: str
+        num_prompt_tokens: int
+        if request_state.request.multimodal_prompt is not None:
+            prompt = request_state.request.multimodal_prompt.text
+            num_prompt_tokens = window_service.get_num_tokens(prompt)
+        else:
+            prompt = request_state.request.prompt
+            num_prompt_tokens = window_service.get_num_tokens(prompt)
         # Total number of tokens in the completion.
         num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/metrics/evaluate_reference_metrics.py RENAMED Viewed

@@ -10,6 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
 from helm.benchmark.scenarios.scenario import Reference
+from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
 from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from nltk.metrics.scores import f_measure
@@ -21,6 +22,7 @@ import string
 from . import code_metrics_helper
 import nltk
 try:
     nltk.data.find("tokenizers/punkt")
 except LookupError:
@@ -188,6 +190,19 @@ def bleu_4(gold: str, pred: str) -> float:
     return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))
+def cider(gold: str, pred: str) -> float:
+    try:
+        from pycocoevalcap.cider.cider import Cider
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+    cider_evaluator = Cider()
+    candidate = {"caption": [pred]}
+    reference = {"caption": [gold]}
+    average_score, _ = cider_evaluator.compute_score(reference, candidate)
+    return average_score
 def extract_set_from_text(
     set_str: str,
     set_start_str: str = " is ",
@@ -325,6 +340,7 @@ def compute_reference_metrics(
         "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
         "code_eval_acc": code_eval,
         "pass": code_eval,
+        "cider": cider,
         "f1_score": f1_score,
         "rouge_1": get_rouge_function("rouge1"),
         "rouge_2": get_rouge_function("rouge2"),

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/metrics/vision_language/image_metrics.py RENAMED Viewed

@@ -28,7 +28,7 @@ from helm.benchmark.metrics.vision_language.image_utils import (
     pixel_similarity,
     sift_similarity,
 )
-from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive
+from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color
 try:
     from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
@@ -78,7 +78,9 @@ class AnnotatedImageMetrics(Metric):
     # Metric names
     COMPILE_METRIC: str = "compilation_success"
-    EARTH_MOVER_SIMILARITY: str = "earth_mover_similarity"
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM1: str = "block_emd_similarity_white"
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM2: str = "block_emd_similarity_median_color"
+    BLOCK_EARTH_MOVER_SIMILARITY: str = "block_emd_similarity"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
     LPIPS_SIMILARITY: str = "lpips_similarity"
@@ -106,7 +108,12 @@ class AnnotatedImageMetrics(Metric):
         metrics: List[AnnotatedMetric] = [
             AnnotatedMetric(self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
             AnnotatedMetric(self.SIFT_SIMILARITY, sift_similarity, "image_np"),
-            AnnotatedMetric(self.EARTH_MOVER_SIMILARITY, self.compute_emd_similarity_recursive, "image_PIL"),
+            # Raw block EMD
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY, self.compute_block_emd_raw, "image_PIL"),
+            # Normalized block EMD against white
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1, self.compute_block_emd_white, "image_PIL"),
+            # Normalized block EMD against median
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2, self.compute_block_emd_median, "image_PIL"),
             AnnotatedMetric(self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
             AnnotatedMetric(self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
             AnnotatedMetric(self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
@@ -407,7 +414,7 @@ class AnnotatedImageMetrics(Metric):
         result = _edit_similarity(completion_tokens, truncated_reference_tokens)
         return result
-    def compute_emd_similarity_recursive(
+    def compute_block_emd_white(
         self,
         pred_image: Image.Image,
         ref_image: Image.Image,
@@ -417,17 +424,23 @@ class AnnotatedImageMetrics(Metric):
         weight_most_frequent_color: float = 0.001,
         use_tqdm: bool = False,
     ):
-        emd_value = compute_emd_recursive(
-            pred_image,
-            ref_image,
-            threshold_most_frequent_color,
-            patch_size,
-            max_num_patches,
-            weight_most_frequent_color,
-            use_tqdm,
-        )
+        """Computes the block Earth Moving Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels. The score is normalized against EMD against white images
+        """
-        def do_it():
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+        def compute_denominator():
             constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
             value = compute_emd_recursive(
                 constant_image,
@@ -443,8 +456,120 @@ class AnnotatedImageMetrics(Metric):
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key = {"metric_name": f"intermediate_{self.EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1}", **hash_dict}
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+    def compute_block_emd_median(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Same as compute_block_emd_white EXCEPT that
+        the normalization is against an image filled with the reference image's most frequent color.
+        """
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+        def compute_denominator():
+            ref_img_np = np.array(ref_image)
+            (rgb_most_frequent_color, _) = get_most_frequent_color(ref_img_np)
+            # Most frequent color as base
+            constant_image = Image.new("RGB", ref_image.size, tuple(rgb_most_frequent_color))  # type: ignore
+            value = compute_emd_recursive(
+                constant_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+            return {"value": value}
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2}", **hash_dict}
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+    def compute_block_emd_raw(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        def compute():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
         assert self._cache is not None
-        response_metric, _ = self._cache.get(cache_key, do_it)
+        emd_raw, _ = self._cache.get(cache_key, compute)
+        return emd_raw["value"]
-        return 1.0 - emd_value / response_metric["value"]
+    def compute_block_emd_raw_wrapper(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Computes the block Earth Moving Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels. The raw (unnormalized) EMD is returned as {"value": emd_value}.
+        """
+        emd_value = compute_emd_recursive(
+            pred_image,
+            ref_image,
+            threshold_most_frequent_color,
+            patch_size,
+            max_num_patches,
+            weight_most_frequent_color,
+            use_tqdm,
+        )
+        return {"value": emd_value}

{crfm_helm-0.5.0 → crfm_helm-0.5.1}/src/helm/benchmark/model_metadata_registry.py RENAMED Viewed

@@ -32,6 +32,7 @@ ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"
 GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
 GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
+GOOGLE_GEMINI_PRO_VISION_V1_TAG: str = "GOOGLE_GEMINI_PRO_VISION_V1_TAG"
 GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"
 # Models which emit garbage tokens when temperature=0.
@@ -159,7 +160,10 @@ def register_model_metadata(model_metadata: ModelMetadata) -> None:
 def get_model_metadata(model_name: str) -> ModelMetadata:
     """Return the `ModelMetadata` for the model name."""
     if model_name not in MODEL_NAME_TO_MODEL_METADATA:
-        raise ValueError(f"No model with name: {model_name}")
+        raise ValueError(
+            f"No model metadata for model name: {model_name} - "
+            "did you remember to add this model to model_metadata.yaml?"
+        )
     return MODEL_NAME_TO_MODEL_METADATA[model_name]

crfm-helm 0.5.0__tar.gz → 0.5.1__tar.gz

crfm-helm 0.5.0__tar.gz → 0.5.1__tar.gz