crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of crfm-helm was flagged as a potentially problematic release by the registry scanner.
Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0

helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py}
@@ -22,6 +22,10 @@ from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 
 PROCESSED: str = "processed"
+DIFFICULTY_ALL = "all"
+DIFFICULTY_EASY = "easy"
+DIFFICULTY_MEDIUM = "medium"
+DIFFICULTY_HARD = "hard"
 
 
 class Image2StructureScenario(Scenario):
@@ -38,13 +38,16 @@ class Image2StructureScenario(Scenario):
         VALID_SPLIT: "validation",
     }
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+    def __init__(
+        self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT, difficulty: str = DIFFICULTY_ALL
+    ):
         super().__init__()
         assert subset in self.SUBSETS, f"Invalid subset: {subset}"
         self._subset: str = subset
         self._recompile_prompt: bool = recompile_prompt
         self._split: str = split
         self._output_path: Optional[str] = None
+        self._difficulty: str = difficulty
 
     def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
         # By default, there are no assets
@@ -110,6 +117,10 @@ class Image2StructureScenario(Scenario):
                 )
                 continue
 
+            # Filter by difficulty
+            if self._difficulty != DIFFICULTY_ALL and row["difficulty"] != self._difficulty:
+                continue
+
             # Step 1: Preprocess the row
             row = self.preprocess_row(row, assets_path)
 
@@ -158,7 +169,7 @@ class Image2StructureScenario(Scenario):
                 # representing the structure (such as LaTeX code)
                 multimedia_object = MultimediaObject([image_object])
                 reference = Reference(
-                    output=Output(text=row["text"], multimedia_content=multimedia_object),
+                    output=Output(text=row["text"] if "text" in row else "", multimedia_content=multimedia_object),
                     tags=[CORRECT_TAG],
                 )
             else:
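
The hunks above introduce a per-row difficulty filter on Image2StructureScenario. A minimal sketch of the new behavior (keep_row and the sample rows are illustrative, not part of the package; the check mirrors the added code):

DIFFICULTY_ALL = "all"
DIFFICULTY_HARD = "hard"


def keep_row(row: dict, difficulty: str) -> bool:
    # A row is kept when no specific difficulty is requested ("all") or when
    # its "difficulty" field matches the requested one.
    return difficulty == DIFFICULTY_ALL or row.get("difficulty") == difficulty


rows = [{"difficulty": "easy"}, {"difficulty": "hard"}]
assert [keep_row(r, DIFFICULTY_HARD) for r in rows] == [False, True]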

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py
@@ -1,22 +1,18 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
+from helm.benchmark.scenarios.vision_language.image2struct.utils_latex import (
     latex_to_image,
     strip_unnecessary_latex_parts,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
 
 
 class LatexScenario(Image2StructureScenario):
     BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
-    SUBSETS = ["equation", "table", "plot", "algorithm"]
+    SUBSETS = ["equation", "table", "plot", "algorithm", "wild", "wild_legacy"]
 
     name = "image2latex"
     description = "Evaluate multimodal models on Latex generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
         image.save(destination_path)

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py
@@ -1,5 +1,4 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
 
 
 class MusicSheetScenario(Image2StructureScenario):
@@ -13,8 +12,5 @@ class MusicSheetScenario(Image2StructureScenario):
     name = "image2musicsheet"
     description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         raise Exception("Music sheets have no ground truth, compilation is not possible")

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py
@@ -5,6 +5,7 @@ import os
 import re
 
 from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
+from helm.common.hierarchical_logger import hlog
 
 try:
     from latex import build_pdf
@@ -12,14 +13,13 @@ try:
     from PIL import ImageOps
     from PIL.Image import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 # LaTeX preamble
 # Make sure to install "latex-full".
 TEX_INCLUDES = r"""
 \usepackage{amsmath,amssymb,amsfonts}
 \usepackage{graphicx}
-\usepackage{graphicx}
 \usepackage{amsmath}
 \usepackage{xcolor}
 \usepackage{algorithm}
@@ -98,23 +98,19 @@ def pdf_to_image(
 
 def strip_unnecessary_latex_parts(latex_code: str) -> str:
     """Strip unnecessary parts of the LaTeX code."""
-
     # Remove comments
     minimal_latex_code = re.sub(r"%.*?\n", "\n", latex_code)
-
     # Remove \documentclass and any \usepackage lines
-    minimal_latex_code = re.sub(r"\\documentclass\{.*?\}\n", "", latex_code)
-    minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}\n", "", minimal_latex_code)
-
+    minimal_latex_code = re.sub(r"\\documentclass(\[.*?\])?\{.*?\}", "", latex_code)
+    minimal_latex_code = re.sub(r"\\documentstyle(\[.*?\])?\{.*?\}", "", minimal_latex_code)
+    minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}", "", minimal_latex_code)
     # Remove everything before \begin{document} and including it, and everything after \end{document}
     minimal_latex_code = re.sub(r"\\begin\{document\}\n*", "", minimal_latex_code, flags=re.DOTALL)
     minimal_latex_code = re.sub(r"\\end\{document\}.*", "", minimal_latex_code, flags=re.DOTALL)
-
     # Ensure \begin{...} is followed by a \n
     minimal_latex_code = re.sub(r"(\\begin\{.*?\}(\[.*?\])?)(?!\n)", r"\1\n", minimal_latex_code)
     # Ensure \end{...} has a \n before it
     minimal_latex_code = re.sub(r"(\\end\{.*?\})(?!\n)", r"\1\n", minimal_latex_code)
-
     # Normalize space sequences to a single space globally
     minimal_latex_code = re.sub(r" +", " ", minimal_latex_code)
     # Replace tabs with a single space
@@ -123,7 +119,6 @@ def strip_unnecessary_latex_parts(latex_code: str) -> str:
     minimal_latex_code = re.sub(r"^[ \t]+|[ \t]+$", "", minimal_latex_code, flags=re.MULTILINE)
     # Remove unnecessary whitespace - multiple empty lines and tabulations
     minimal_latex_code = re.sub(r"\n\s*\n", "\n", minimal_latex_code)
-
     return minimal_latex_code.strip()
 
 
@@ -226,25 +221,21 @@ def handle_latex_error(
     # Error format: "LaTeX Error: Environment <env> undefined."
     undefined_search = re.search(r"LaTeX Error: Environment (.*) undefined", str_e)
     if undefined_search:
-        # If a package is missing and this is our first retry, then simply include TEX_INCLUDES
-        if num_try_remaining == MAX_NUM_TRIES:
-            fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
-        if num_try_remaining < MAX_NUM_TRIES or fixed_code == original_latex_code:
-            # Here we try to manually solve the missing environment.
-            # This is either executed on the second rety or the first if no changements
-            # were made in the first retry.
-            assert TEX_INCLUDES in fixed_code, "TEX_INCLUDES should be present in the code"
-            # TEX_INCLUDES is already present, so we add the missing package
-            # Since we cannot know the name of the package that contains the missing environment,
-            # we simply hope that they are named the same way.
-            env_undefined: str = undefined_search.group(1)
-
-            if f"\\usepackage{{{env_undefined}}}" in fixed_code:
-                # We already tried to include the missing package, but it probably
-                # does not exist, so we raise an error
-                raise RuntimeError(str(e)) from e
-
-            fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
+        # Here we try to manually solve the missing environment.
+        # This is either executed on the second rety or the first if no changements
+        # were made in the first retry.
+        assert TEX_INCLUDES in fixed_code, f"TEX_INCLUDES should be present in the code. code={fixed_code}"
+        # TEX_INCLUDES is already present, so we add the missing package
+        # Since we cannot know the name of the package that contains the missing environment,
+        # we simply hope that they are named the same way.
+        env_undefined: str = undefined_search.group(1)
+
+        if f"\\usepackage{{{env_undefined}}}" in fixed_code:
+            # We already tried to include the missing package, but it probably
+            # does not exist, so we raise an error
+            raise RuntimeError(str(e)) from e
+
+        fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
 
     # Try again with the fixed code (if the fixed code is different from the original code)
     if fixed_code != original_latex_code:
@@ -313,23 +304,24 @@
 
     # 2. Add preamble
     # 2.1. Remove \documentclass if present to make sure we use our own
-    documentclass_search = re.search(r"\\documentclass\{(.*)\}", original_latex_code)
+    documentclass_search = re.search(r"\\documentclass(\[.*?\])?\{.*?\}", original_latex_code)
+    documentstyle_search = re.search(r"\\documentstyle(\[.*?\])?\{.*?\}", original_latex_code)
     if documentclass_search:
-        documentclass: str = documentclass_search.group(1)
-        original_latex_code = original_latex_code.replace(f"\\documentclass{{{documentclass}}}", TEX_BEGIN_FILE)
+        matching_string = documentclass_search.group()
+        original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
+    elif documentstyle_search:
+        matching_string = documentstyle_search.group()
+        original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
     else:
         # If there is no \documentclass, we add our own
         original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code
 
-    # 2.2. Add includes. In this first step, we only add includes if none are present.
-    # We do this because if some are present, we might define them twice which can cause errors
-    # and this section should not make the original LaTeX code fail if it was compilable.
-    # If there are missing packages, in handle_latex_error, we will add TEX_INCLUDES after the begin document,
-    # which might define some packages twice, but often solves the problem.
-    if not re.search(r"\\usepackage\{.*\}", original_latex_code):
-        original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
+    # 2.2. Add includes. In this ste we remove all includes for the default ones.
+    original_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*\}", "", original_latex_code)
+    original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
 
     latex_code: str = original_latex_code
+    hlog(f"Compiling LaTeX code:\n{latex_code}")
     try:
         pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
         image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
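
The loosened preamble-stripping patterns above now also match optional arguments and \documentstyle. A standalone check of the patterns in isolation (the sample input is illustrative, not from the dataset):

import re

# The updated regexes strip \documentclass, \documentstyle, and \usepackage,
# including any optional [...] arguments.
code = r"\documentclass[12pt]{article}\usepackage[utf8]{inputenc}E = mc^2"
code = re.sub(r"\\documentclass(\[.*?\])?\{.*?\}", "", code)
code = re.sub(r"\\documentstyle(\[.*?\])?\{.*?\}", "", code)
code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}", "", code)
assert code == "E = mc^2"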

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py
@@ -6,7 +6,7 @@ try:
     from selenium import webdriver
     import selenium.common.exceptions
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def init_driver(url: str, resolution: Tuple[int, int] = (1920, 1080)) -> webdriver.Chrome:

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py
@@ -5,7 +5,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def convert_html_to_text(handler: HTML2Text, html: str) -> str:

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py
@@ -1,23 +1,26 @@
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Optional
 
+from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
 from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
     Image2StructureScenario,
     PROCESSED,
+    DIFFICULTY_ALL,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.jekyll_server import JekyllServer
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.driver import (
     save_random_screenshot,
     ScreenshotOptions,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
 from helm.common.general import ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.hierarchical_logger import hlog
 
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 import base64
@@ -72,28 +75,48 @@ def serve_and_take_screenshot(
     if not success:
         # This runs on examples that are not expected to fail
         server.stop()
+        hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
         raise ValueError(f"Jekyll server failed to start: {repo_path}")
 
     # Take a screenshot of a random page
     success = False
-    error: Exception
-    for _ in range(max_tries):
+    error: Optional[Exception] = None
+
+    MAX_TRIES_ALL_ERRORS = 3
+    MAX_TRIES_CONNECTION_REFUSED = 5
+    MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
+    for compilation_attempt in range(MAX_TRIES):
         try:
             infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
             success = True
             break
         except Exception as e:
-            if "net::ERR_CONNECTION_REFUSED" in str(e):
-                error = e
+            error = e
+
+            if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
+                hlog(
+                    f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
+                    f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
+                )
                 server.stop()
                 time.sleep(0.5)
                 server.start()
                 time.sleep(0.5)
+            elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Retrying..."
+                )
             else:
                 # Do not retry
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Raising CompilationError."
+                )
                 break
+
     if not success:
-        raise ValueError(f"Failed to take a screenshot: {error}")
+        raise CompilationError(f"Failed to take a screenshot: {error}")
 
     # Stop the server
     server.stop()
@@ -128,7 +151,7 @@ class WebpageScenario(Image2StructureScenario):
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript"]
+    SUBSETS = ["css", "html", "javascript", "wild", "wild_legacy"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -140,9 +163,10 @@ class WebpageScenario(Image2StructureScenario):
         subset: str,
         recompile_prompt: bool = True,
         split: str = VALID_SPLIT,
+        difficulty: str = DIFFICULTY_ALL,
         screenshot_options: ScreenshotOptions = ScreenshotOptions(),
     ):
-        super().__init__(subset, recompile_prompt, split)
+        super().__init__(subset, recompile_prompt, split, difficulty)
         self._screenshot_options = screenshot_options
         self._html2text = HTML2Text()
         self._html2text.ignore_links = True
@@ -165,6 +189,13 @@ class WebpageScenario(Image2StructureScenario):
             shutil.rmtree(assets_save_path)
         ensure_directory_exists(assets_save_path)
 
+        if "wild" in self._subset:
+            # There is no stucture
+            del row["assets"]
+            row["assets_paths"] = []
+            row["assets_names"] = []
+            return row
+
         # Structure is a base64 encoding of the repo
         if self._output_path is None:
            raise ValueError("Output path not set")
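
The rewritten retry loop in serve_and_take_screenshot above uses two different caps, with the larger one reserved for connection-refused errors. A minimal sketch of that control flow (take_screenshot stands in for save_random_screenshot; the server restart between attempts is elided):

# Sketch of the retry policy: connection-refused errors keep retrying while
# attempt < 5, any other error only while attempt < 3.
MAX_TRIES_ALL_ERRORS = 3
MAX_TRIES_CONNECTION_REFUSED = 5
MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)


def take_screenshot_with_retries(take_screenshot) -> None:
    error = None
    for attempt in range(MAX_TRIES):
        try:
            take_screenshot()
            return
        except Exception as e:
            error = e
            if "net::ERR_CONNECTION_REFUSED" in str(e) and attempt < MAX_TRIES_CONNECTION_REFUSED:
                continue  # restart the server here, then retry
            elif attempt < MAX_TRIES_ALL_ERRORS:
                continue  # other errors get the smaller retry budget
            else:
                break  # give up and surface the last error
    raise RuntimeError(f"Failed to take a screenshot: {error}")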

helm/benchmark/scenarios/vision_language/math_vista_scenario.py
@@ -51,7 +51,7 @@ class MathVistaScenario(Scenario):
     name = "math_vista"
     description = (
         "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
-        "([paper](https://arxiv.org/abs/2310.02255))."
+        "([Lu et al., 2024](https://arxiv.org/abs/2310.02255))."
     )
     tags = ["vision-language", "reasoning", "math"]
 

helm/benchmark/scenarios/vision_language/mementos_scenario.py
@@ -38,10 +38,10 @@ class MementosScenario(Scenario):
     Paper: https://arxiv.org/abs/2401.10529
     """
 
-    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"
+    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "RussWang96/unofficial_mementos_dataset"
 
     IMAGE_URL: str = (
-        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
+        "https://huggingface.co/datasets/RussWang96/unofficial_mementos_dataset/resolve/main/"
         + "{subject}/{split}/{file_name}?download=true"
     )
 
@@ -56,7 +56,7 @@ class MementosScenario(Scenario):
     name = "mementos"
     description = (
         "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
-        " ([paper](https://arxiv.org/abs/2401.10529))."
+        " ([Wang et al., 2024](https://arxiv.org/abs/2401.10529))."
     )
     tags = ["vision-language"]
 

helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
@@ -48,14 +48,14 @@ class MMSafetyBenchScenario(Scenario):
     }
 
     QUESTIONS_URL_TEMPLATE: str = (
-        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/processed_questions/{dataset}.json"
     )
     IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
 
     name = "mm_safety_bench"
     description = (
         "Expose the vulnerability of open-source VLMs with toxic and biased content "
-        "([paper](https://arxiv.org/abs/2311.17600))."
+        "([Liu et al., 2023](https://arxiv.org/abs/2311.17600))."
     )
     tags = ["vision-language", "bias", "toxicity"]
 

helm/benchmark/scenarios/vision_language/mme_scenario.py
@@ -19,22 +19,22 @@ from helm.common.general import ensure_directory_exists
 
 class MMEScenario(Scenario):
     """
-    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
-
-    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
-    multimodal tasks, showing amazing emergent abilities in recent studies. However,
-    it is difficult for these case studies to fully reflect the performance of MLLM,
-    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
-    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
-    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
-    that may arise from direct use of public datasets for evaluation, the annotations
-    of instruction-answer pairs are all manually designed. The concise instruction design
-    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
-    Besides, with such an instruction, we can also easily carry out quantitative
-    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
-    We use the multiple-choice metrics for 14 different evaluation tasks.
-
-    @article{fu2023mme,
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for these case studies to fully reflect the performance of MLLM,
+    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
+    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
+    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
+    that may arise from direct use of public datasets for evaluation, the annotations
+    of instruction-answer pairs are all manually designed. The concise instruction design
+    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
+    Besides, with such an instruction, we can also easily carry out quantitative
+    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
+    We use the multiple-choice metrics for 14 different evaluation tasks.
+
+    @article{fu2023mme,
         title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
         author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
         Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
@@ -43,7 +43,7 @@ class MMEScenario(Scenario):
         year={2023}
     }
 
-    Paper: https://arxiv.org/abs/2306.13394
+    Paper: https://arxiv.org/abs/2306.13394
     """
 
     MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
@@ -66,7 +66,10 @@ class MMEScenario(Scenario):
     ]
 
     name = "mme"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2306.13394))."
+    description = (
+        "Evaluate multimodal models on their perception and cognition abilities on a total of 14 subtasks "
+        "([Fu et al., 2023](https://arxiv.org/abs/2306.13394))."
+    )
     tags = ["vision-language"]
     options: List[str] = ["Yes", "No"]
 

helm/benchmark/scenarios/vision_language/mmmu_scenario.py
@@ -81,7 +81,7 @@ class MMMUScenario(Scenario):
     name = "mmmu"
     description = (
         "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
-        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
+        "subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502))."
     )
     tags = ["vision-language"]
 

helm/benchmark/scenarios/vision_language/pairs_scenario.py
@@ -19,7 +19,7 @@ class PAIRSScenario(Scenario):
     """
     Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
 
-    Modified to ensure there is no ambiguity regarding the preferred choice for each question.
+    Modified to add an option to opt-out with "unclear" as a choice.
 
     @misc{fraser2024examining,
         title={Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel
@@ -186,7 +186,7 @@ class PAIRSScenario(Scenario):
     name = "pairs"
     description = (
         "Examining gender and racial bias in VLMs Using a Novel Dataset of Parallel Images. "
-        "([paper](https://arxiv.org/abs/2402.05779))."
+        "([Fraser et al., 2024](https://arxiv.org/abs/2402.05779))."
     )
     tags = ["vision-language", "bias"]
 
@@ -232,13 +232,14 @@ class PAIRSScenario(Scenario):
                 MediaObject(location=local_image_path, content_type="image/png"),
                 MediaObject(text=question.text, content_type="text/plain"),
             ]
+            references = [Reference(Output(text=choice), tags=[]) for i, choice in enumerate(question.choices)]
+            # Add the preferred choice "unclear" as the correct answer
+            references.append(Reference(Output(text="unclear"), tags=[CORRECT_TAG]))
+
             instances.append(
                 Instance(
                     Input(multimedia_content=MultimediaObject(content)),
-                    references=[
-                        Reference(Output(text=choice), tags=[CORRECT_TAG] if i == question.preferred_choice else [])
-                        for i, choice in enumerate(question.choices)
-                    ],
+                    references=references,
                     split=TEST_SPLIT,
                 )
             )
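
The change above makes "unclear" the single correct reference for every PAIRS question. A self-contained sketch of the resulting reference layout (the choices below are placeholders, not taken from the dataset):

from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

# Original options become untagged references; a new "unclear" reference
# carries the CORRECT_TAG.
choices = ["the person on the left", "the person on the right"]  # illustrative
references = [Reference(Output(text=choice), tags=[]) for choice in choices]
references.append(Reference(Output(text="unclear"), tags=[CORRECT_TAG]))
assert sum(CORRECT_TAG in ref.tags for ref in references) == 1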

helm/benchmark/scenarios/vision_language/pope_scenario.py
@@ -42,7 +42,8 @@ class POPEScenario(Scenario):
 
     name = "pope"
     description = (
-        "Open-ended questions about hallucination images ([paper](https://aclanthology.org/2023.emnlp-main.20/))."
+        "Open-ended questions about hallucination images "
+        "([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20/))."
     )
     tags = ["vision-language", "visual question answering"]
     options: List[str] = ["Yes", "No"]

helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py
@@ -0,0 +1,57 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class RealWorldQAScenario(Scenario):
+    """
+    RealWorldQA is a benchmark designed for real-world understanding. The dataset consists of anonymized
+    images taken from vehicles, in addition to other real-world images.
+
+    Blog post: https://x.ai/blog/grok-1.5v
+    Website: https://huggingface.co/datasets/xai-org/RealworldQA
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "xai-org/RealworldQA"
+
+    name = "real_world_qa"
+    description = (
+        "A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models "
+        "([xAI, 2024](https://x.ai/blog/grok-1.5v))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=TEST_SPLIT, cache_dir=output_path)):
+            # Save the image to disk
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=TEST_SPLIT)
+            )
+
+        return instances
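
A short usage sketch for the new scenario (illustrative only; the output directory is a placeholder, and in practice HELM constructs scenarios through run specs rather than directly):

from helm.benchmark.scenarios.vision_language.real_world_qa_scenario import RealWorldQAScenario

# Download the xai-org/RealworldQA dataset, cache images under output_path,
# and build one instance per question with the answer as the correct reference.
scenario = RealWorldQAScenario()
instances = scenario.get_instances(output_path="benchmark_output/scenarios/real_world_qa")
print(len(instances), instances[0].references[0].output.text)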

helm/benchmark/scenarios/vision_language/seed_bench_scenario.py
@@ -35,10 +35,10 @@ class SEEDBenchScenario(Scenario):
     the multiple-choice metric for evaluating the performance of models.
 
     @article{li2023seed,
-        title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
-        author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
-        journal={arXiv preprint arXiv:2307.16125},
-        year={2023}
+        title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+        author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+        journal={arXiv preprint arXiv:2307.16125},
+        year={2023}
     }
 
     Paper: https://arxiv.org/abs/2307.16125
@@ -59,7 +59,9 @@ class SEEDBenchScenario(Scenario):
     }
 
     name = "seed_bench"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2307.16125))."
+    description = (
+        "Evaluate multimodal models on 9 evaluation aspects " "([Li et al., 2023](https://arxiv.org/abs/2307.16125))."
+    )
     tags = ["vision-language"]
 
     def __init__(self, subject: str):