crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -28,14 +28,14 @@ from helm.benchmark.metrics.vision_language.image_utils import (
     pixel_similarity,
     sift_similarity,
 )
-from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color
+from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color, to_gray
 
 try:
     from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
     from PIL import Image
     import imagehash
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def pad(small_image: Image.Image, large_image: Image.Image, axis: int) -> Image.Image:
@@ -78,9 +78,8 @@ class AnnotatedImageMetrics(Metric):
 
     # Metric names
     COMPILE_METRIC: str = "compilation_success"
-
-
-    BLOCK_EARTH_MOVER_SIMILARITY: str = "block_emd_similarity"
+    EARTH_MOVER_SIMILARITY = "earth_mover_similarity"
+    BLOCK_EMD: str = "block_emd"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
     LPIPS_SIMILARITY: str = "lpips_similarity"
@@ -108,12 +107,10 @@ class AnnotatedImageMetrics(Metric):
         metrics: List[AnnotatedMetric] = [
             AnnotatedMetric(self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
             AnnotatedMetric(self.SIFT_SIMILARITY, sift_similarity, "image_np"),
-            # Raw block
-            AnnotatedMetric(
-
-
-            # Normalized block EMD against median
-            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2, self.compute_block_emd_median, "image_PIL"),
+            AnnotatedMetric(self.BLOCK_EMD, self.compute_block_emd_raw, "image_PIL"),  # Raw block-EMD
+            AnnotatedMetric(
+                self.EARTH_MOVER_SIMILARITY, self.ems, "image_PIL"
+            ),  # Normalized block-EMD against black/white
             AnnotatedMetric(self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
             AnnotatedMetric(self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
             AnnotatedMetric(self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
@@ -391,9 +388,15 @@ class AnnotatedImageMetrics(Metric):
         features1 = self._get_inception_features(img1_tensor)
         features2 = self._get_inception_features(img2_tensor)
 
-
-
-
+        # TODO: Justify the value of the constant here or remove this code to only keep the cosine similarity.
+        # fid_score = self._calculate_fid(features1, features2)
+        # normalize_fid: float = np.exp(-fid_score * self.NORMALIZE_FID_FACTOR)
+        # return normalize_fid
+
+        # Use the cosine similarity between the features as a proxy for FID
+        # Return a score between 0 and 1, where 1 is the most similar
+        score = 0.5 * (1 + np.dot(features1[0], features2[0]) / (np.linalg.norm(features1) * np.linalg.norm(features2)))
+        return score
 
     def compute_ssim(self, generated_image: np.ndarray, reference_image: np.ndarray) -> float:
         """Compute the Structural Similarity Index (SSIM) between the generated and reference images."""
@@ -414,58 +417,7 @@ class AnnotatedImageMetrics(Metric):
         result = _edit_similarity(completion_tokens, truncated_reference_tokens)
         return result
 
-    def
-        self,
-        pred_image: Image.Image,
-        ref_image: Image.Image,
-        threshold_most_frequent_color: float = 0.5,
-        patch_size: Tuple[int, int] = (8, 8),
-        max_num_patches: int = 100,
-        weight_most_frequent_color: float = 0.001,
-        use_tqdm: bool = False,
-    ):
-        """Computes the block Earth Moving Distance (EMD). This attempts to
-        speed up EMD for images with huge areas by considering movement/transformatio
-        of blocks of pixels. The score is normalized against EMD against white images
-        """
-
-        def compute_numerator():
-            return self.compute_block_emd_raw_wrapper(
-                pred_image,
-                ref_image,
-                threshold_most_frequent_color,
-                patch_size,
-                max_num_patches,
-                weight_most_frequent_color,
-                use_tqdm,
-            )
-
-        def compute_denominator():
-            constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
-            value = compute_emd_recursive(
-                constant_image,
-                ref_image,
-                threshold_most_frequent_color,
-                patch_size,
-                max_num_patches,
-                weight_most_frequent_color,
-                use_tqdm,
-            )
-            return {"value": value}
-
-        hash_dict = {
-            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
-        }
-        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
-        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1}", **hash_dict}
-
-        assert self._cache is not None
-        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
-        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
-
-        return 1.0 - emd_raw["value"] / emd_base["value"]
-
-    def compute_block_emd_median(
+    def ems(
         self,
         pred_image: Image.Image,
         ref_image: Image.Image,
@@ -493,9 +445,13 @@ class AnnotatedImageMetrics(Metric):
         def compute_denominator():
             ref_img_np = np.array(ref_image)
             (rgb_most_frequent_color, _) = get_most_frequent_color(ref_img_np)
+            grayscale_most_frequent_color = to_gray(rgb_most_frequent_color)[0]
 
             # Most frequent color as base
-
+            if grayscale_most_frequent_color < 127:
+                constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # Make it white
+            else:
+                constant_image = Image.new("RGB", ref_image.size, (0, 0, 0))  # Make it black
             value = compute_emd_recursive(
                 constant_image,
                 ref_image,
@@ -509,9 +465,10 @@ class AnnotatedImageMetrics(Metric):
 
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+            "generated_image": str(AnnotatedImageMetrics.HASH_FUNC(pred_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key_numerator = {"metric_name": f"intermediate_{self.
-        cache_key_denominator = {"metric_name":
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EMD}", **hash_dict}
+        cache_key_denominator = {"metric_name": "intermediate_ems_extreme_denominator", **hash_dict}
 
         assert self._cache is not None
         emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
@@ -542,8 +499,9 @@ class AnnotatedImageMetrics(Metric):
 
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+            "generated_image": str(AnnotatedImageMetrics.HASH_FUNC(pred_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key = {"metric_name": f"intermediate_{self.
+        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EMD}", **hash_dict}
         assert self._cache is not None
         emd_raw, _ = self._cache.get(cache_key, compute)
 
@@ -560,8 +518,8 @@ class AnnotatedImageMetrics(Metric):
         use_tqdm: bool = False,
     ):
         """Computes the block Earth Moving Distance (EMD). This attempts to
-        speed up EMD for images with huge areas by considering
-        of blocks of pixels.
+        speed up EMD for images with huge areas by considering
+        movement/transformation of blocks of pixels.
         """
         emd_value = compute_emd_recursive(
             pred_image,
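For orientation, the hunks above (which appear to come from helm/benchmark/metrics/vision_language/image_metrics.py, judging by the imports and class name) replace the old white-image normalization of the block EMD with an `ems` ("earth mover similarity") score that normalizes against a constant black or white image, chosen as the opposite extreme of the reference's most frequent gray level. A minimal standalone sketch of that normalization follows; the helper names and the `block_emd` callback are hypothetical stand-ins, while the real logic lives in `AnnotatedImageMetrics.ems` and `compute_emd_recursive`.

from typing import Callable

from PIL import Image


def earth_mover_similarity(
    pred_image: Image.Image,
    ref_image: Image.Image,
    block_emd: Callable[[Image.Image, Image.Image], float],
    grayscale_most_frequent_color: int,
) -> float:
    """Sketch: normalize the raw block EMD by the EMD between the reference and a
    constant image at the opposite gray extreme (white if the reference is mostly
    dark, black otherwise), then map the ratio to a similarity score."""
    if grayscale_most_frequent_color < 127:
        extreme = Image.new("RGB", ref_image.size, (255, 255, 255))  # mostly dark reference -> white baseline
    else:
        extreme = Image.new("RGB", ref_image.size, (0, 0, 0))  # mostly light reference -> black baseline
    return 1.0 - block_emd(pred_image, ref_image) / block_emd(extreme, ref_image)

The raw value is also exposed separately under the new `block_emd` metric name, while the normalized score is reported as `earth_mover_similarity`.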
@@ -6,7 +6,7 @@ try:
     import cv2
     from PIL.Image import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def preprocess_image(image: Image) -> np.ndarray:
@@ -22,9 +22,6 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
-# Mistral instruction-following format
-MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
-
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -69,6 +66,9 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# Deprecated models that are no longer available.
+# These are usually closed API models that have been permanently removed
+DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
@@ -1,6 +1,9 @@
+import ast
+import dataclasses
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict
 import dacite
+from inspect import cleandoc
 import mako.template
 import yaml
 import importlib_resources as resources
@@ -17,6 +20,11 @@ SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
 SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"
 
 
+_ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
+_ADAPTER_SPEC_FILENAME = "adapter_spec.py"
+_ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"
+
+
 @dataclass(frozen=True)
 class Field:
     """
@@ -198,9 +206,6 @@ class RunGroup(Field):
 class Schema:
     """Specifies information about what to display on the frontend."""
 
-    # Adapter fields (e.g., temperature)
-    adapter: List[Field]
-
     # Information about each field
     metrics: List[Field]
 
@@ -213,6 +218,11 @@ class Schema:
     # Group the scenarios
     run_groups: List[RunGroup]
 
+    # Adapter fields (e.g., temperature)
+    # Automatically populated from the docstrings in the AdapterSpec class definition.
+    # Should not be specified in the user's YAML file.
+    adapter: Optional[List[Field]] = None
+
     def __post_init__(self):
         self.name_to_metric = {metric.name: metric for metric in self.metrics}
         self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
@@ -220,6 +230,43 @@ class Schema:
         self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
 
 
+def get_adapter_fields() -> List[Field]:
+    """Generate the adapter fields from the docstrings in the AdapterSpec class definition."""
+    # Unfortunately there is no standard library support for getting docstrings of class fields,
+    # so we have to do the parsing outselves. Fortunately, the parsing is quite straightforward.
+    adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
+    with open(adapter_spec_path, "r") as f:
+        contents = f.read()
+    module_node = ast.parse(contents)
+    adapter_spec_node = [
+        node
+        for node in ast.iter_child_nodes(module_node)
+        if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
+    ][0]
+    metadata_fields: List[Field] = []
+    field_name: str = ""
+    for node in ast.iter_child_nodes(adapter_spec_node):
+        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
+            # This node is a field definition.
+            # Save the name of the field for later.
+            field_name = node.target.id
+        else:
+            # If this is a docstring that immediately follows a field definition,
+            # output an adapter field with the name set to the field definition and
+            # the description set to the docstring.
+            if (
+                field_name
+                and isinstance(node, ast.Expr)
+                and isinstance(node.value, ast.Constant)
+                and isinstance(node.value.value, str)
+            ):
+                description = cleandoc(node.value.value).replace("\n", " ")
+                metadata_fields.append(Field(name=field_name, description=description))
+            field_name = ""
+
+    return metadata_fields
+
+
 def get_default_schema_path() -> str:
     return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)
 
@@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
     hlog(f"Reading schema file {schema_path}...")
     with open(schema_path, "r") as f:
         raw = yaml.safe_load(f)
-
+    schema = dacite.from_dict(Schema, raw)
+    if schema.adapter:
+        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+    return dataclasses.replace(schema, adapter=get_adapter_fields())
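The new `get_adapter_fields` helper above walks the AST of adapter_spec.py and pairs each annotated class field with the bare string literal that immediately follows it. A self-contained sketch of that convention is shown below; the toy class and the `temperature` docstring text are made up for illustration, while the `method` description matches the string asserted in the new test_schema.py.

import ast
from inspect import cleandoc

# Toy source following the field-docstring convention that get_adapter_fields parses:
# an annotated assignment followed immediately by a bare string literal.
source = '''
class AdapterSpec:
    method: str = ""
    """The high-level strategy for converting instances into a prompt for the language model."""

    temperature: float = 1.0
    """Temperature parameter used in generation."""
'''

class_node = ast.parse(source).body[0]
fields = {}
field_name = ""
for node in ast.iter_child_nodes(class_node):
    if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
        field_name = node.target.id  # remember the field; its docstring may follow
    elif field_name and isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
        fields[field_name] = cleandoc(node.value.value).replace("\n", " ")
        field_name = ""

print(fields["method"])  # same description that test_get_adapter_fields checks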
@@ -16,6 +16,7 @@ class TestRunEntry:
 
     @pytest.mark.parametrize("fname", list_fnames())
     def test_read_all_specs(self, fname: str):
+        pytest.skip("Skipping slow tests")
         run_entries = read_run_entries([fname])
         for entry in run_entries.entries:
             construct_run_specs(parse_object_spec(entry.description))
@@ -0,0 +1,11 @@
+from helm.benchmark.presentation.schema import get_adapter_fields
+
+
+def test_get_adapter_fields() -> None:
+    adapter_fields = get_adapter_fields()
+    assert adapter_fields
+    assert adapter_fields[0].name == "method"
+    assert (
+        adapter_fields[0].description
+        == "The high-level strategy for converting instances into a prompt for the language model."
+    )
helm/benchmark/run.py
CHANGED
@@ -1,9 +1,11 @@
 import argparse
 from dataclasses import replace
 import os
+import re
 from typing import List, Optional
 
 
+from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
@@ -264,6 +266,13 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--openvino",
+        action="store_true",
+        default=False,
+        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -275,12 +284,19 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
         for huggingface_model_name in args.enable_huggingface_models:
-
+            if args.openvino:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+            else:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
         for huggingface_model_path in args.enable_local_huggingface_models:
-
+            if args.openvino:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+            else:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
@@ -300,6 +316,19 @@ def main():
     ensure_directory_exists(args.output_path)
     set_benchmark_output_path(args.output_path)
 
+    # Validate the --models-to-run flag
+    if args.models_to_run:
+        all_models = set(model_metadata_registry.get_all_models())
+        for model_to_run in args.models_to_run:
+            if model_to_run not in all_models:
+                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+    else:
+        model_expander_pattern = re.compile(
+            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
+        )
+        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,
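The new validation block above refuses to proceed when a run entry uses one of the broad `model=` run-expander aliases without an explicit --models-to-run allow-list. A small sketch of what the regex does and does not match; the pattern is copied from the hunk, while the run entry strings are made-up examples.

import re

model_expander_pattern = re.compile(
    r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
)

# Made-up run entry descriptions, for illustration only.
assert model_expander_pattern.search("mmlu:subject=anatomy,model=text")  # alias expanding to many models
assert not model_expander_pattern.search("mmlu:subject=anatomy,model=openai/gpt-4-0613")  # one concrete model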
helm/benchmark/run_expander.py
CHANGED
@@ -10,6 +10,7 @@ from helm.benchmark.model_metadata_registry import (
     get_all_text_models,
     get_model_metadata,
     get_model_names_with_tag,
+    DEPRECATED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
@@ -194,6 +195,15 @@ class StopRunExpander(RunExpander):
         self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "none":
+            return [
+                replace(
+                    run_spec,
+                    name=f"{run_spec.name},{self.name}={self.value}",
+                    adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                ),
+            ]
+
         if self.value == "hash":
             stop = "###"
         elif self.value == "semicolon":
@@ -334,16 +344,6 @@ class AnthropicClaude3RunExpander(RunExpander):
                 run_spec,
                 adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
             )
-        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
-            instructions = "Answer with only a single letter."
-            if run_spec.adapter_spec.instructions:
-                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
-            return [
-                replace(
-                    run_spec,
-                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
-                ),
-            ]
         return [run_spec]
 
 
@@ -601,6 +601,12 @@ class ModelRunExpander(ReplaceValueRunExpander):
                 values_dict["ablation"] = models
             else:
                 values_dict[family_name] = models
+
+        # For each of the keys above, filter out deprecated models.
+        deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+        for family_name in values_dict.keys():
+            values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+
         return values_dict
 
 
@@ -1035,6 +1041,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
     "chinese": {"chinese": [translate(language_code="zh-CN")]},
     "hindi": {"hindi": [translate(language_code="hi")]},
     "spanish": {"spanish": [translate(language_code="es")]},
+    "swahili": {"swahili": [translate(language_code="sw")]},
     # Styles
     "art": {
         "art": [
@@ -1380,6 +1387,101 @@ class ChatMLRunExpander(RunExpander):
     ]
 
 
+class OutputFormatInstructions(RunExpander):
+    """Add extra instructions to about output formatting to HELM Lite scenarios.
+
+    Many instruction-following models and chat models are tuned to expect conversational prompts
+    and respond in a conversational way. These models occasionally produce outputs that are not
+    in the expected format. This run expander instructs these models to provide the output in
+    the format expected by the scenario.
+
+    The argument should be the name of the scenario."""
+
+    name = "output_format_instructions"
+
+    _SUFFIX_SUFFIX = "_suffix"
+
+    def __init__(self, scenario: str):
+        if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
+            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            self.suffix = True
+        else:
+            self.scenario = scenario
+            self.suffix = False
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            if self.scenario == "mmlu_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "mmlu":
+                instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa":
+                instructions = "Answer with only a single letter."
+            else:
+                instructions = "Answer with only a single letter."
+        elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+            output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+            if self.scenario == "narrative_qa":
+                instructions = (
+                    "Answer with one word, a few-word phrase, or a short sentence. "
+                    + "Avoid extra, unnecessary information in the answer."
+                )
+            elif self.scenario == "natural_qa":
+                instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "legalbench":
+                if output_noun != "Answer":
+                    instructions = f"Answer with the {output_noun.lower()}."
+                else:
+                    instructions = "Answer yes or no."
+            elif self.scenario == "legalbench_abercrombie":
+                instructions = "Answer with only 'generic', 'descriptive', 'suggestive', 'arbitrary' or 'fanciful'."
+            elif self.scenario == "legalbench_function_of_decision_section":
+                instructions = "Answer with only 'Facts', 'Procedural History', 'Issue', 'Rule', 'Analysis', 'Conclusion' or 'Decree'."  # noqa: E501
+            elif self.scenario == "legalbench_yes_or_no":
+                instructions = "Answer with only 'Yes' or 'No'."
+            elif self.scenario == "wmt_14":
+                instructions = "Answer with the English translation."
+            elif self.scenario == "wmt_14_only_last_sentence":
+                instructions = "Answer with only the English translation for the last sentence."
+            elif self.scenario == "math":
+                instructions = "Wrap the final answer with the \\boxed{} command."
+            elif self.scenario == "numeric_nlg":
+                instructions = "Answer with only description of the last table as a single paragraph on a single line."
+            elif self.scenario == "tab_fact":
+                instructions = (
+                    "Answer with only the classification of the last statement, either 'refuted' or 'entailed'."
+                )
+            elif self.scenario == "wikitq":
+                instructions = (
+                    "Answer only the last question with a short answer. "
+                    "Avoid extra, unnecessary information in the answer."
+                )
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
+
+        if self.suffix:
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(
+                        run_spec.adapter_spec,
+                        global_suffix=f"{run_spec.adapter_spec.global_suffix}\n\n{instructions}",
+                    ),
+                ),
+            ]
+
+        if run_spec.adapter_spec.instructions:
+            instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+        else:
+            instructions = f"{instructions}\n"
+        return [
+            replace(
+                run_spec,
+                adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+            ),
+        ]
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
@@ -1402,6 +1504,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
     EvalSplitRunExpander,
+    OutputFormatInstructions,
 ]
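The new `OutputFormatInstructions` expander prepends scenario-specific formatting instructions to the adapter spec, or appends them to the global suffix when the argument ends in `_suffix`. A schematic sketch of that prefix/suffix split is shown below using plain strings rather than the real `RunSpec`/`AdapterSpec` dataclasses; the function name and signature are invented for illustration.

from typing import Tuple


def apply_output_format_instructions(
    instructions: str, existing_instructions: str, global_suffix: str, as_suffix: bool
) -> Tuple[str, str]:
    """Sketch of the expander's two modes: as_suffix appends to the global suffix,
    otherwise the new instructions are prepended to the existing adapter instructions."""
    if as_suffix:
        return existing_instructions, f"{global_suffix}\n\n{instructions}"
    if existing_instructions:
        return f"{instructions}\n\n{existing_instructions}", global_suffix
    return f"{instructions}\n", global_suffix


# e.g. for MMLU under the multiple-choice-joint adaptation:
new_instructions, new_suffix = apply_output_format_instructions(
    "Answer with only a single letter.", "", "", as_suffix=False
)

Presumably the expander is selected from a run entry the same way as other value-taking expanders (for example an `output_format_instructions=mmlu` clause in the run description), though that invocation syntax is inferred rather than shown in this diff.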
@@ -156,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
            increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
            run_spec = singleton(increase_temperature_expander.expand(run_spec))
 
+        # MedLM-Large
+        if run_spec.adapter_spec.model == "google/medlm-large":
+            run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
         return run_spec
 
     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("air_bench_2024")
+def get_air_bench_2024_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="air_bench_2024",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["air_bench_2024"],
+    )