crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py CHANGED
@@ -1,24 +1,26 @@
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Optional
 
+from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
 from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
     Image2StructureScenario,
     PROCESSED,
     DIFFICULTY_ALL,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.jekyll_server import JekyllServer
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.driver import (
     save_random_screenshot,
     ScreenshotOptions,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
 from helm.common.general import ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.hierarchical_logger import hlog
 
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 import base64
@@ -73,28 +75,48 @@ def serve_and_take_screenshot(
     if not success:
         # This runs on examples that are not expected to fail
         server.stop()
+        hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
         raise ValueError(f"Jekyll server failed to start: {repo_path}")
 
     # Take a screenshot of a random page
     success = False
-    error: Exception
-    for _ in range(max_tries):
+    error: Optional[Exception] = None
+
+    MAX_TRIES_ALL_ERRORS = 3
+    MAX_TRIES_CONNECTION_REFUSED = 5
+    MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
+    for compilation_attempt in range(MAX_TRIES):
         try:
             infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
             success = True
             break
         except Exception as e:
-            if "net::ERR_CONNECTION_REFUSED" in str(e):
-                error = e
+            error = e
+
+            if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
+                hlog(
+                    f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
+                    f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
+                )
                 server.stop()
                 time.sleep(0.5)
                 server.start()
                 time.sleep(0.5)
+            elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Retrying..."
+                )
             else:
                 # Do not retry
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Raising CompilationError."
+                )
                 break
+
     if not success:
-        raise ValueError(f"Failed to take a screenshot: {error}")
+        raise CompilationError(f"Failed to take a screenshot: {error}")
 
     # Stop the server
     server.stop()
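
The hunk above replaces the single retry budget (`max_tries`) with two tiers: a `net::ERR_CONNECTION_REFUSED` failure, which usually just means the Jekyll server has not finished starting, gets up to five attempts with a server restart in between, while any other error is retried at most three times before a `CompilationError` is raised. A minimal standalone sketch of that policy, with hypothetical `take_screenshot` and `restart_server` callables standing in for the real driver and `JekyllServer` calls, and a plain `RuntimeError` standing in for `CompilationError`:

    import time
    from typing import Callable, Optional

    MAX_TRIES_ALL_ERRORS = 3
    MAX_TRIES_CONNECTION_REFUSED = 5


    def screenshot_with_retries(take_screenshot: Callable[[], None], restart_server: Callable[[], None]) -> None:
        """Illustrative two-tier retry policy; a sketch, not the HELM implementation itself."""
        error: Optional[Exception] = None
        for attempt in range(max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)):
            try:
                take_screenshot()
                return
            except Exception as e:
                error = e
                if "net::ERR_CONNECTION_REFUSED" in str(e) and attempt < MAX_TRIES_CONNECTION_REFUSED:
                    # The server is probably still starting up: bounce it and try again.
                    restart_server()
                    time.sleep(0.5)
                elif attempt < MAX_TRIES_ALL_ERRORS:
                    # Unknown error: retry, but with a smaller budget.
                    continue
                else:
                    # Out of retries for unknown errors.
                    break
        raise RuntimeError(f"Failed to take a screenshot: {error}")

Keeping the connection-refused budget larger than the general budget tolerates transient startup races without also retrying genuinely broken pages five times.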
@@ -129,7 +151,7 @@ class WebpageScenario(Image2StructureScenario):
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript", "real"]
+    SUBSETS = ["css", "html", "javascript", "wild", "wild_legacy"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -167,6 +189,13 @@ class WebpageScenario(Image2StructureScenario):
            shutil.rmtree(assets_save_path)
        ensure_directory_exists(assets_save_path)
 
+       if "wild" in self._subset:
+           # There is no stucture
+           del row["assets"]
+           row["assets_paths"] = []
+           row["assets_names"] = []
+           return row
+
        # Structure is a base64 encoding of the repo
        if self._output_path is None:
            raise ValueError("Output path not set")
helm/benchmark/scenarios/vision_language/math_vista_scenario.py CHANGED
@@ -51,7 +51,7 @@ class MathVistaScenario(Scenario):
     name = "math_vista"
     description = (
         "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
-        "([paper](https://arxiv.org/abs/2310.02255))."
+        "([Lu et al., 2024](https://arxiv.org/abs/2310.02255))."
     )
     tags = ["vision-language", "reasoning", "math"]
 
helm/benchmark/scenarios/vision_language/mementos_scenario.py CHANGED
@@ -38,10 +38,10 @@ class MementosScenario(Scenario):
     Paper: https://arxiv.org/abs/2401.10529
     """
 
-    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"
+    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "RussWang96/unofficial_mementos_dataset"
 
     IMAGE_URL: str = (
-        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
+        "https://huggingface.co/datasets/RussWang96/unofficial_mementos_dataset/resolve/main/"
         + "{subject}/{split}/{file_name}?download=true"
     )
 
@@ -56,7 +56,7 @@ class MementosScenario(Scenario):
     name = "mementos"
     description = (
         "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
-        " ([paper](https://arxiv.org/abs/2401.10529))."
+        " ([Wang et al., 2024](https://arxiv.org/abs/2401.10529))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py CHANGED
@@ -48,14 +48,14 @@ class MMSafetyBenchScenario(Scenario):
     }
 
     QUESTIONS_URL_TEMPLATE: str = (
-        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/processed_questions/{dataset}.json"
     )
     IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
 
     name = "mm_safety_bench"
     description = (
         "Expose the vulnerability of open-source VLMs with toxic and biased content "
-        "([paper](https://arxiv.org/abs/2311.17600))."
+        "([Liu et al., 2023](https://arxiv.org/abs/2311.17600))."
     )
     tags = ["vision-language", "bias", "toxicity"]
 
helm/benchmark/scenarios/vision_language/mme_scenario.py CHANGED
@@ -19,22 +19,22 @@ from helm.common.general import ensure_directory_exists
 
 class MMEScenario(Scenario):
     """
-    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
-
-    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
-    multimodal tasks, showing amazing emergent abilities in recent studies. However,
-    it is difficult for these case studies to fully reflect the performance of MLLM,
-    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
-    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
-    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
-    that may arise from direct use of public datasets for evaluation, the annotations
-    of instruction-answer pairs are all manually designed. The concise instruction design
-    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
-    Besides, with such an instruction, we can also easily carry out quantitative
-    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
-    We use the multiple-choice metrics for 14 different evaluation tasks.
-
-    @article{fu2023mme,
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for these case studies to fully reflect the performance of MLLM,
+    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
+    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
+    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
+    that may arise from direct use of public datasets for evaluation, the annotations
+    of instruction-answer pairs are all manually designed. The concise instruction design
+    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
+    Besides, with such an instruction, we can also easily carry out quantitative
+    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
+    We use the multiple-choice metrics for 14 different evaluation tasks.
+
+    @article{fu2023mme,
     title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
     author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
     Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
@@ -43,7 +43,7 @@ class MMEScenario(Scenario):
     year={2023}
     }
 
-    Paper: https://arxiv.org/abs/2306.13394
+    Paper: https://arxiv.org/abs/2306.13394
     """
 
     MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
@@ -66,7 +66,10 @@ class MMEScenario(Scenario):
     ]
 
     name = "mme"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2306.13394))."
+    description = (
+        "Evaluate multimodal models on their perception and cognition abilities on a total of 14 subtasks "
+        "([Fu et al., 2023](https://arxiv.org/abs/2306.13394))."
+    )
     tags = ["vision-language"]
     options: List[str] = ["Yes", "No"]
 
helm/benchmark/scenarios/vision_language/mmmu_scenario.py CHANGED
@@ -81,7 +81,7 @@ class MMMUScenario(Scenario):
     name = "mmmu"
     description = (
         "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
-        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
+        "subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/pairs_scenario.py CHANGED
@@ -186,7 +186,7 @@ class PAIRSScenario(Scenario):
     name = "pairs"
     description = (
         "Examining gender and racial bias in VLMs Using a Novel Dataset of Parallel Images. "
-        "([paper](https://arxiv.org/abs/2402.05779))."
+        "([Fraser et al., 2024](https://arxiv.org/abs/2402.05779))."
     )
     tags = ["vision-language", "bias"]
 
helm/benchmark/scenarios/vision_language/pope_scenario.py CHANGED
@@ -42,7 +42,8 @@ class POPEScenario(Scenario):
 
     name = "pope"
     description = (
-        "Open-ended questions about hallucination images ([paper](https://aclanthology.org/2023.emnlp-main.20/))."
+        "Open-ended questions about hallucination images "
+        "([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20/))."
     )
     tags = ["vision-language", "visual question answering"]
     options: List[str] = ["Yes", "No"]
helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py ADDED
@@ -0,0 +1,57 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class RealWorldQAScenario(Scenario):
+    """
+    RealWorldQA is a benchmark designed for real-world understanding. The dataset consists of anonymized
+    images taken from vehicles, in addition to other real-world images.
+
+    Blog post: https://x.ai/blog/grok-1.5v
+    Website: https://huggingface.co/datasets/xai-org/RealworldQA
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "xai-org/RealworldQA"
+
+    name = "real_world_qa"
+    description = (
+        "A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models "
+        "([xAI, 2024](https://x.ai/blog/grok-1.5v))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=TEST_SPLIT, cache_dir=output_path)):
+            # Save the image to disk
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=TEST_SPLIT)
+            )
+
+        return instances
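
One detail worth noting in the new scenario: each image is written to disk under a name derived from `generate_hash(image)`, so re-running the scenario reuses the cached file instead of saving the image again. `generate_hash` comes from `helm.common.images_utils` (which also changed in this release) and its implementation is not shown in this diff; the sketch below only illustrates the idea with an ordinary SHA-1 over the encoded image bytes, which is an assumption rather than HELM's actual helper:

    import hashlib
    import io
    import os

    from PIL import Image


    def content_hash(image: Image.Image) -> str:
        """Deterministic file name for an image based on its content (illustrative only, not HELM's generate_hash)."""
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return hashlib.sha1(buffer.getvalue()).hexdigest()


    def save_once(image: Image.Image, output_path: str) -> str:
        """Write the image only if a file with the same content hash is not already on disk."""
        local_image_path = os.path.join(output_path, content_hash(image) + ".jpg")
        if not os.path.exists(local_image_path):
            image.convert("RGB").save(local_image_path)  # JPEG has no alpha channel.
        return local_image_path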
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py CHANGED
@@ -35,10 +35,10 @@ class SEEDBenchScenario(Scenario):
     the multiple-choice metric for evaluating the performance of models.
 
     @article{li2023seed,
-    title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
-    author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
-    journal={arXiv preprint arXiv:2307.16125},
-    year={2023}
+    title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+    author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+    journal={arXiv preprint arXiv:2307.16125},
+    year={2023}
     }
 
     Paper: https://arxiv.org/abs/2307.16125
@@ -59,7 +59,9 @@ class SEEDBenchScenario(Scenario):
     }
 
     name = "seed_bench"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2307.16125))."
+    description = (
+        "Evaluate multimodal models on 9 evaluation aspects " "([Li et al., 2023](https://arxiv.org/abs/2307.16125))."
+    )
     tags = ["vision-language"]
 
     def __init__(self, subject: str):
helm/benchmark/scenarios/vision_language/unicorn_scenario.py CHANGED
@@ -55,8 +55,8 @@ class UnicornScenario(Scenario):
 
     name = "unicorn"
     description = (
-        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
-        " ([paper](https://arxiv.org/abs/2311.16101))."
+        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects "
+        "([Tu et al., 2023](https://arxiv.org/abs/2311.16101))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py CHANGED
@@ -39,7 +39,7 @@ class VibeEvalScenario(Scenario):
     year={2024}
     }
 
-    Paper: https://arxiv.org/abs/2306.13394
+    Paper: https://arxiv.org/abs/2405.02287
     """
 
     VIBE_EVAL_HUGGINGFACE_DATASET_NAME: str = "RekaAI/VibeEval"
@@ -50,8 +50,11 @@ class VibeEvalScenario(Scenario):
     ]
 
     name = "vibe_eval"
-    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2405.02287))."
-    tags = ["vision-language"]
+    description = (
+        "Evaluate multimodal models on day-to-day tasks "
+        "([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
 
     def __init__(self, subject: str):
         super().__init__()
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py CHANGED
@@ -53,7 +53,7 @@ class VizWizScenario(Scenario):
     name = "viz_wiz"
     description = (
         "Real-world VQA dataset consisting of questions asked by "
-        "people who are blind ([paper](https://arxiv.org/abs/1802.08218))."
+        "people who are blind ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218))."
     )
     tags = ["vision-language", "visual question answering"]
 
helm/benchmark/scenarios/vision_language/vqa_scenario.py CHANGED
@@ -55,7 +55,9 @@ class VQAScenario(Scenario):
     }
 
     name = "vqa"
-    description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
+    description = (
+        "Open-ended questions about real-world images " "([Goyal et al., 2017](https://arxiv.org/abs/1612.00837))."
+    )
     tags = ["vision-language", "visual question answering"]
 
     def get_instances(self, output_path: str) -> List[Instance]:
helm/benchmark/scenarios/xstest_scenario.py ADDED
@@ -0,0 +1,35 @@
+from typing import List
+from datasets import load_dataset
+
+from .scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
+
+
+class XSTestScenario(Scenario):
+    """
+    XSTest, a simple standardized evaluation for safety refusals.
+    """
+
+    name = "XSTest"
+    description = "XSTest dataset - safety benchmark"
+    tags = ["safety"]
+
+    def __init__(self, subject: str = "all"):
+        super().__init__()
+        self.subset = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # read explicit data
+        dataset = load_dataset("walledai/XSTest", split="test", revision="f1d713187c61b6ae64e602d74f0b3d812cc2e8e8")
+
+        # Read all the instances
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["prompt"])
+            references = []
+            for column_name in ["focus", "type", "note"]:
+                if row[column_name]:
+                    references += [Reference(output=Output(text=row[column_name]), tags=[])]
+            references += [Reference(output=Output(text=row["label"]), tags=[CORRECT_TAG])]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
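
For context on the new scenario above: each row of `walledai/XSTest` provides a `prompt`, a `label`, and the descriptive `focus`, `type`, and `note` columns; the loop keeps the label as the single reference tagged `CORRECT_TAG` and the non-empty descriptive columns as untagged references. A small illustration with a made-up row (the field values here are hypothetical; only the column names come from the code above):

    from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Instance, Input, Output, Reference

    # Hypothetical row in the shape the scenario expects.
    row = {"prompt": "How do I kill a Python process?", "focus": "kill", "type": "homonyms", "note": "", "label": "safe"}

    references = [
        Reference(output=Output(text=row[column]), tags=[])
        for column in ["focus", "type", "note"]
        if row[column]  # empty columns are skipped, matching the scenario's check
    ]
    references.append(Reference(output=Output(text=row["label"]), tags=[CORRECT_TAG]))
    instance = Instance(input=Input(text=row["prompt"]), references=references, split=TEST_SPLIT)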
helm/benchmark/server.py CHANGED
@@ -113,11 +113,6 @@ def main():
         default=None,
         help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
     )
-    parser.add_argument(
-        "--jquery",
-        action="store_true",
-        help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
-    )
     args = parser.parse_args()
 
     if args.suite and args.release:
@@ -126,7 +121,7 @@ def main():
     # Determine the location of the static directory.
     # This is a hack: it assumes that the static directory has a physical location,
     # which is not always the case (e.g. when using zipimport).
-    static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
+    static_package_name = "helm.benchmark.static_build"
     resource_path = resources.files(static_package_name).joinpath("index.html")
     with resources.as_file(resource_path) as resource_filename:
         static_path = str(resource_filename.parent)
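
With the `--jquery` flag removed, the server always serves the React build from the `helm.benchmark.static_build` package. The surviving lines use `importlib.resources` to turn that package into a directory path; a small standalone sketch of the same lookup, which can be handy for checking where the frontend assets live in an installed environment (the final print is for illustration only):

    from importlib import resources

    # Resolve the packaged index.html to a real file, then serve its parent directory.
    # This assumes the package is installed as plain files on disk; for zip-based installs,
    # as_file() would only extract index.html to a temporary location, which is why the
    # original comments call this approach a hack.
    static_package_name = "helm.benchmark.static_build"
    resource_path = resources.files(static_package_name).joinpath("index.html")
    with resources.as_file(resource_path) as resource_filename:
        static_path = str(resource_filename.parent)

    print(static_path)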