crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
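The list above reflects a package-level reorganization: modules under `helm/proxy/clients` and `helm/proxy/tokenizers` moved to the top-level `helm/clients` and `helm/tokenizers` packages, and the monolithic `helm/benchmark/run_specs.py` was split into a `helm/benchmark/run_specs/` package. A minimal sketch of the import change downstream code would need, assuming the class names carried over unchanged from the moved modules:

```python
# Before (crfm-helm 0.4.0): clients and tokenizers lived under helm.proxy.
# from helm.proxy.clients.huggingface_client import HuggingFaceClient
# from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# After (crfm-helm 0.5.0), per the renames in the file list above:
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
```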
helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py ADDED
@@ -0,0 +1,169 @@
+ import os.path
+ from typing import Dict, List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+
+
+ class MultipanelVQAScenario(Scenario):
+     """
+     Muffin or Chihuahua? Challenging Large Vision-Language Models with Multipanel VQA
+
+     We introduce Multipanel Visual Question Answering (MultipanelVQA), a novel benchmark
+     comprising 6,600 triplets of questions, answers, and multipanel images that specifically
+     challenge models in comprehending multipanel images. Our evaluation shows that questions in
+     the MultipanelVQA benchmark pose significant challenges to the state-of-the-art Large Vision
+     Language Models (LVLMs) tested, even though humans can attain approximately 99% accuracy on
+     these questions. There are two types of questions in two different situations in the
+     MultipanelVQA benchmark: multiple-choice or open-ended generation paired with real-world or
+     synthetic images. We use the multiple-choice metrics and the exact match metric for the two
+     question-answering types, respectively.
+
+     @article{fan2024muffin,
+         title={Muffin or Chihuahua? Challenging Large Vision-Language Models with Multipanel VQA},
+         author={Fan, Yue and Gu, Jing and Zhou, Kaiwen and Yan, Qianqi and Jiang, Shan and
+                 Kuo, Ching-Chen and Guan, Xinze and Wang, Xin Eric},
+         journal={arXiv preprint arXiv:2401.15847},
+         year={2024}
+     }
+
+     Paper: https://arxiv.org/abs/2401.15847
+     """
+
+     MULTIPANELVQA_HUGGINGFACE_DATASET_NAME: Dict[str, str] = {
+         "synthetic": "yfan1997/MultipanelVQA_synthetic",
+         "real-world": "yfan1997/MultipanelVQA_real-world",
+     }
+
+     SUBJECTS: List[str] = ["synthetic", "real-world"]
+
+     name = "multipanelvqa"
+     description = "Evaluate multimodal models on multipanel VQA ([paper](https://arxiv.org/abs/2401.15847))."
+     tags = ["vision-language"]
+
+     def __init__(self, subject: str, question_type: str):
+         super().__init__()
+         assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+         self._subject: str = subject
+
+         assert question_type in ["multiple-choice", "open"], f"Invalid question type: {question_type}"
+         self._question_type: str = question_type
+
+     def convert_text_answer_to_option(self, text_answer: str, question: str):
+         option_answer: str
+         # Some answers may include a ')'
+         if len(text_answer) <= 3:
+             option_answer = text_answer[0]
+         else:
+             # There are examples where the answer is the text answer
+             # instead of an option
+             for line in question.split("\n"):
+                 if text_answer in line:
+                     option_answer = line[0]
+                     break
+         return option_answer.upper()
+
+     def split_options_and_question(self, original_question: str):
+         question_and_options: List[str] = [item.strip().lower() for item in original_question.split("\n")]
+         last_append_phrase: str = "(please select one)"
+         question: str = question_and_options[0]
+         options: List[str] = []
+         if len(question_and_options) >= 6:
+             for item in question_and_options[1:]:
+                 if last_append_phrase in item:
+                     break
+                 options.append(item[3:])
+         elif len(question_and_options) == 5:
+             for item in question_and_options[1:]:
+                 if last_append_phrase in item:
+                     item = item[: -len(last_append_phrase)]
+                 options.append(item[3:])
+         return question, options
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+
+         # There is only the test split in the MultipanelVQA benchmark
+         instances: List[Instance] = []
+         # Process the test set
+         # Two open-ended generation instances and
+         # one multiple-choice generation instance per row
+         for image_index, row in enumerate(
+             tqdm(
+                 load_dataset(
+                     self.MULTIPANELVQA_HUGGINGFACE_DATASET_NAME[self._subject],
+                     split=TEST_SPLIT,
+                     cache_dir=output_path,
+                 )
+             )
+         ):
+             # Download the image
+             # Save the image locally
+             image_path: str = os.path.join(images_path, f"{image_index}.png")
+             if not os.path.exists(image_path):
+                 row["image"].save(image_path)
+
+             # Add the references
+             references: List[Reference] = []
+             question: str
+             answer: str
+             content: List[MediaObject]
+             if self._question_type == "open":
+                 question_1: str = row["question_1"]
+                 question_2: str = row["question_2"]
+                 answer_1: str = row["answer_1"]
+                 answer_2: str = row["answer_2"]
+                 for answer, question in zip([answer_1, answer_2], [question_1, question_2]):
+                     content = [
+                         MediaObject(location=image_path, content_type="image/png"),
+                         MediaObject(text=question, content_type="text/plain"),
+                     ]
+                     instances.append(
+                         Instance(
+                             Input(multimedia_content=MultimediaObject(content)),
+                             references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                             split=TEST_SPLIT,
+                         )
+                     )
+             else:
+                 options: List[str]
+                 original_question: str = row["question_3"]
+                 question, options = self.split_options_and_question(original_question)
+                 answer = row["answer_3"].strip()
+                 answer = self.convert_text_answer_to_option(answer, original_question)
+                 # The given correct answer is a letter, but we need an index
+                 correct_answer_index: int = ord(answer) - ord("A")
+                 # The options are originally appended to the question
+
+                 for i, option in enumerate(options):
+                     reference: Reference
+                     is_correct: bool = i == correct_answer_index
+                     reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                     references.append(reference)
+
+                 content = [
+                     MediaObject(location=image_path, content_type="image/png"),
+                     MediaObject(text=question, content_type="text/plain"),
+                 ]
+                 instances.append(
+                     Instance(
+                         Input(multimedia_content=MultimediaObject(content)),
+                         references=references,
+                         split=TEST_SPLIT,
+                     )
+                 )
+
+         return instances
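A quick sketch (not part of the diff) of the option-parsing helpers defined above; the question string is hypothetical but follows the "(please select one)" format that `split_options_and_question` expects:

```python
from helm.benchmark.scenarios.vision_language.multipanelvqa_scenario import MultipanelVQAScenario

scenario = MultipanelVQAScenario(subject="synthetic", question_type="multiple-choice")
original_question = (
    "Which panel shows a cat?\n"
    "(A) top left\n"
    "(B) top right\n"
    "(C) bottom left\n"
    "(D) bottom right\n"
    "(please select one)"
)
# Lowercases the text and strips the "(a) "-style prefixes from each option.
question, options = scenario.split_options_and_question(original_question)
print(question)  # "which panel shows a cat?"
print(options)
# Short answers like "B" are taken as the option letter directly.
print(scenario.convert_text_answer_to_option("B", original_question))  # "B"
```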
helm/benchmark/scenarios/vision_language/pope_scenario.py ADDED
@@ -0,0 +1,104 @@
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from datasets import load_dataset
+ from tqdm import tqdm
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+
+
+ class POPEScenario(Scenario):
+     """
+     POPE dataset
+     Despite the promising progress on Large Vision-Language Models (LVLMs), we find that LVLMs suffer from
+     the hallucination problem, i.e. they tend to generate objects that are inconsistent with the target
+     images in the descriptions. To investigate it, this work presents the first systematic study on object
+     hallucination of LVLMs based on the VQAv2 benchmark. We find that objects that frequently occur in the
+     visual instructions or co-occur with the image objects are especially prone to be hallucinated by LVLMs.
+     In POPE, images from VQAv2 are matched with questions asking about the presence of certain objects in the
+     image. We use the exact match metric for model evaluation on POPE.
+
+     @inproceedings{li2023evaluating,
+         title={Evaluating Object Hallucination in Large Vision-Language Models},
+         author={Li, Yifan and Du, Yifan and Zhou, Kun and Wang, Jinpeng and Zhao, Wayne Xin and Wen, Ji-Rong},
+         booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
+         pages={292--305},
+         year={2023}
+     }
+
+     Paper: https://aclanthology.org/2023.emnlp-main.20/
+     """
+
+     POPE_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/POPE"
+
+     name = "pope"
+     description = (
+         "Open-ended questions about object hallucination in images ([paper](https://aclanthology.org/2023.emnlp-main.20/))."
+     )
+     tags = ["vision-language", "visual question answering"]
+     options: List[str] = ["Yes", "No"]
+
+     def get_label_from_answer(self, answer: str):
+         label: str
+         if answer == "yes":
+             label = "A"
+         elif answer == "no":
+             label = "B"
+         else:
+             raise NotImplementedError(f"Invalid answer: {answer}")
+         return label
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+         instances: List[Instance] = []
+         for row in tqdm(
+             load_dataset(
+                 self.POPE_HUGGINGFACE_DATASET_NAME,
+                 split=TEST_SPLIT,
+                 cache_dir=output_path,
+             )
+         ):
+             image_source: str = row["image_source"]
+             # Save the image locally
+             image_path: str = os.path.join(images_path, f"{image_source}.jpg")
+             if not os.path.exists(image_path):
+                 row["image"].save(image_path)
+
+             question: str = row["question"]
+             answer: str = row["answer"]
+             references: List[Reference] = []
+
+             answer = self.get_label_from_answer(answer)
+             # The given correct answer is a letter, but we need an index
+             correct_answer_index: int = ord(answer) - ord("A")
+             # The options are the fixed choices "Yes" and "No"
+
+             for i, option in enumerate(self.options):
+                 reference: Reference
+                 is_correct: bool = i == correct_answer_index
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                 references.append(reference)
+
+             content = [
+                 MediaObject(location=image_path, content_type="image/jpeg"),
+                 MediaObject(text=question, content_type="text/plain"),
+             ]
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=references,
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
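A hedged sketch of how a scenario like this is exercised; the output directory is a placeholder, and HELM's runner normally makes this call rather than user code:

```python
from helm.benchmark.scenarios.vision_language.pope_scenario import POPEScenario

# Downloads lmms-lab/POPE on first use and caches images under the output path.
scenario = POPEScenario()
instances = scenario.get_instances(output_path="./pope_scratch")  # hypothetical path

# Each instance pairs an image and a question with two references,
# "Yes" and "No", exactly one of which is tagged CORRECT_TAG.
first = instances[0]
print([(ref.output.text, ref.tags) for ref in first.references])
```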
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py ADDED
@@ -0,0 +1,129 @@
+ import os.path
+ from typing import Dict, List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+
+
+ class SEEDBenchScenario(Scenario):
+     """
+     SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension
+
+     Based on powerful Large Language Models (LLMs), recent generative Multimodal
+     Large Language Models (MLLMs) have gained prominence as a pivotal research area.
+     In SEED-Bench, we address the evaluation of generative comprehension in MLLMs
+     as a preliminary step towards a comprehensive assessment of generative models.
+     SEED-Bench consists of 19K multiple-choice questions with accurate human annotations
+     (6x larger than existing benchmarks), which span 12 evaluation dimensions
+     including the comprehension of both the image and video modality. We select 9
+     evaluation aspects that take images as input. In the benchmark,
+     multiple-choice questions with ground-truth options derived from human
+     annotation enable an objective and efficient assessment of model performance,
+     eliminating the need for human or GPT intervention during evaluation. We employ
+     the multiple-choice metric for evaluating the performance of models.
+
+     @article{li2023seed,
+         title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+         author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+         journal={arXiv preprint arXiv:2307.16125},
+         year={2023}
+     }
+
+     Paper: https://arxiv.org/abs/2307.16125
+     """
+
+     SEED_BENCH_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/SEED-Bench"
+
+     SUBJECTS: Dict[str, int] = {
+         "scene-understanding": 1,
+         "instance-identity": 2,
+         "instance-attributes": 3,
+         "instance-location": 4,
+         "instances-counting": 5,
+         "spatial-relation": 6,
+         "instance-interaction": 7,
+         "visual-reasoning": 8,
+         "text-understanding": 9,
+     }
+
+     name = "seed_bench"
+     description = "Evaluate multimodal models on generative comprehension ([paper](https://arxiv.org/abs/2307.16125))."
+     tags = ["vision-language"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+         self._subject: str = subject
+
+     def get_subject_name(self, subject_name: str) -> str:
+         return "-".join(subject_name.lower().split())
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+
+         # There is only the test split in the SEED-Bench benchmark
+         instances: List[Instance] = []
+         # Process the test set:
+         # one multiple-choice instance per row,
+         # filtered to the selected question type
+         for row in tqdm(
+             load_dataset(
+                 self.SEED_BENCH_HUGGINGFACE_DATASET_NAME,
+                 split=TEST_SPLIT,
+                 cache_dir=output_path,
+             )
+         ):
+             question_type_key: str = self.get_subject_name(self._subject)
+             if row["question_type_id"] != self.SUBJECTS[question_type_key]:
+                 continue
+             question_id: str = row["question_id"]
+             # Download the image
+             # Save the image locally
+             image_path: str = os.path.join(images_path, f"{question_id}.png")
+             if not os.path.exists(image_path):
+                 # Some images are in CMYK mode; convert to RGB.
+                 row["image"][0].convert("RGB").save(image_path, "PNG", optimize=True)
+
+             # Add the references
+             references: List[Reference] = []
+             question: str = row["question"]
+             answer: str
+             content: List[MediaObject]
+             options: List[str] = [row["choice_a"], row["choice_b"], row["choice_c"], row["choice_d"]]
+             answer = row["answer"].strip()
+             # The given correct answer is a letter, but we need an index
+             correct_answer_index: int = ord(answer) - ord("A")
+             # The options come from the four choice columns
+
+             for i, option in enumerate(options):
+                 reference: Reference
+                 is_correct: bool = i == correct_answer_index
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                 references.append(reference)
+
+             content = [
+                 MediaObject(location=image_path, content_type="image/png"),
+                 MediaObject(text=question, content_type="text/plain"),
+             ]
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=references,
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
helm/benchmark/scenarios/vision_language/unicorn_scenario.py ADDED
@@ -0,0 +1,108 @@
+ import os.path
+ from typing import Dict, List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+ class UnicornScenario(Scenario):
+     """
+     How Many Unicorns are in this Image? A Safety Evaluation Benchmark of Vision LLMs
+
+     We shift our focus from evaluating standard performance to introducing a comprehensive safety evaluation
+     suite Unicorn, covering both out-of-distribution (OOD) generalization and adversarial robustness. For the OOD
+     evaluation, we present two novel VQA datasets --- OODCV-VQA and Sketchy-VQA, each with one variant, designed
+     to test model performance under challenging conditions. In the OOD scenario, questions are matched with
+     boolean or numerical answers, and we use exact match metrics for evaluation. When comparing OOD Sketchy-VQA
+     with its synthesized in-distribution counterpart, we found an average model output F1 drop of 8.9%,
+     highlighting the challenging nature of the OOD scenario in the Unicorn benchmark.
+
+     @article{tu2023unicorns,
+         title={How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs},
+         author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han,
+                 Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
+         journal={arXiv preprint arXiv:2311.16101},
+         year={2023}
+     }
+
+     Paper: https://arxiv.org/abs/2311.16101
+     """
+
+     UNICORN_HUGGINGFACE_DATASET_NAME: str = "PahaII/unicorn"
+
+     IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"
+
+     SUBJECTS: List[str] = ["OODCV-VQA", "OODCV-Counterfactual", "Sketchy-VQA", "Sketchy-Challenging"]
+
+     IMG_TYPE: Dict[str, str] = {
+         "OODCV-VQA": "jpeg",
+         "OODCV-Counterfactual": "jpeg",
+         "Sketchy-VQA": "png",
+         "Sketchy-Challenging": "png",
+     }
+
+     name = "unicorn"
+     description = (
+         "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
+         " ([paper](https://arxiv.org/abs/2311.16101))."
+     )
+     tags = ["vision-language"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+         self._subject: str = subject
+         self._image_type: str = self.IMG_TYPE[self._subject]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+
+         # There is only the test split in the Unicorn benchmark
+         instances: List[Instance] = []
+         question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+
+         # Process the test set
+         for row in tqdm(
+             load_dataset(
+                 self.UNICORN_HUGGINGFACE_DATASET_NAME,
+                 data_files=question_data_files,
+                 split=TEST_SPLIT,
+                 cache_dir=output_path,
+             )
+         ):
+             # Download the image
+             image_path: str = row["image_path"]
+             local_image_path: str = os.path.join(output_path, image_path)
+             ensure_file_downloaded(
+                 source_url=self.IMAGE_URL.format(image_path=image_path),
+                 target_path=local_image_path,
+                 unpack=False,
+             )
+
+             content: List[MediaObject] = [
+                 MediaObject(location=local_image_path, content_type=f"image/{self._image_type}"),
+                 MediaObject(text=row["question"], content_type="text/plain"),
+             ]
+             answer: str = row["answer"]
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py CHANGED
@@ -13,7 +13,7 @@ from helm.benchmark.scenarios.scenario import (
      Scenario,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+ from helm.common.general import ensure_file_downloaded
 
 
  class VizWizScenario(Scenario):
@@ -60,7 +60,6 @@ class VizWizScenario(Scenario):
      def get_instances(self, output_path: str) -> List[Instance]:
          # Download the questions and annotations
          annotations_path: str = os.path.join(output_path, "annotations")
-         ensure_directory_exists(annotations_path)
          ensure_file_downloaded(
              source_url=self.ANNOTATIONS_URL,
              target_path=annotations_path,
helm/benchmark/scenarios/vision_language/vqa_scenario.py CHANGED
@@ -54,7 +54,7 @@ class VQAScenario(Scenario):
          TEST_SPLIT: "http://images.cocodataset.org/zips/test2015.zip",
      }
 
-     name = "visual_question_answering"
+     name = "vqa"
      description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
      tags = ["vision-language", "visual question answering"]
 
helm/benchmark/scenarios/wmt_14_scenario.py CHANGED
@@ -61,7 +61,7 @@ class WMT14Scenario(Scenario):
      def get_instances(self, output_path: str) -> List[Instance]:
          with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
              subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
-             hf_dataset: Any = load_dataset("wmt14", subset_name)
+             hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
          splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
 
          instances: List[Instance] = []
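The `wmt14` dataset on Hugging Face is backed by a loading script, and recent releases of the `datasets` library only execute such scripts when the caller opts in, which is what the `trust_remote_code=True` argument above does. A standalone sketch of the same call, using `fr-en` as an example subset:

```python
from datasets import load_dataset

# Without trust_remote_code=True, newer `datasets` versions refuse
# to run the wmt14 loading script.
hf_dataset = load_dataset("wmt14", "fr-en", trust_remote_code=True)
print(hf_dataset["validation"][0])  # {"translation": {"fr": ..., "en": ...}}
```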
helm/benchmark/server.py CHANGED
@@ -70,6 +70,14 @@ def serve_benchmark_output(filename):
      return response
 
 
+ @app.get("/cache/output/<filename:path>")
+ def serve_cache_output(filename):
+     response = static_file(filename, root=app.config["helm.cacheoutputpath"])
+     response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
+     response.set_header("Expires", "0")
+     return response
+
+
  @app.get("/")
  @app.get("/<filename:path>")
  def serve_static(filename="index.html"):
@@ -87,6 +95,12 @@ def main():
          help="The location of the output path (filesystem path or URL)",
          default="benchmark_output",
      )
+     parser.add_argument(
+         "--cache-output-path",
+         type=str,
+         help="The location of the filesystem cache output folder (filesystem path or URL)",
+         default="prod_env/cache/output",
+     )
      parser.add_argument(
          "--suite",
          type=str,
@@ -99,6 +113,11 @@ def main():
          default=None,
          help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
      )
+     parser.add_argument(
+         "--jquery",
+         action="store_true",
+         help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
+     )
      args = parser.parse_args()
 
      if args.suite and args.release:
@@ -107,7 +126,8 @@ def main():
      # Determine the location of the static directory.
      # This is a hack: it assumes that the static directory has a physical location,
      # which is not always the case (e.g. when using zipimport).
-     resource_path = resources.files("helm.benchmark.static").joinpath("index.html")
+     static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
+     resource_path = resources.files(static_package_name).joinpath("index.html")
      with resources.as_file(resource_path) as resource_filename:
          static_path = str(resource_filename.parent)
 
@@ -117,16 +137,19 @@ def main():
          # Output path is a URL, so set the output path base URL in the frontend to that URL
          # so that the frontend reads from that URL directly.
          app.config["helm.outputpath"] = None
+         # TODO: figure out helm.cacheoutputpath
          app.config["helm.outputurl"] = args.output_path
      else:
          # Output path is a location on disk, so set the output path base URL to /benchmark_output
          # and then serve files from the location on disk at that URL.
          app.config["helm.outputpath"] = path.abspath(args.output_path)
+         app.config["helm.cacheoutputpath"] = path.abspath(args.cache_output_path)
          app.config["helm.outputurl"] = "benchmark_output"
 
      app.config["helm.suite"] = args.suite or "latest"
      app.config["helm.release"] = args.release
 
+     print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
      app.run(host="0.0.0.0", port=args.port)
 
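Taken together, the server changes expose files under the cache output directory at the new `/cache/output/` route with HTTP caching disabled. A hedged sketch of exercising it; the `helm-server` invocation, port, and filename below are assumptions, not part of the diff:

```python
import requests  # third-party HTTP client

# Assumes `helm-server --cache-output-path prod_env/cache/output` is running
# locally on port 8000; "example.png" is a hypothetical file in that folder.
response = requests.get("http://localhost:8000/cache/output/example.png")
response.raise_for_status()
print(response.headers["Cache-Control"])  # "no-cache, no-store, must-revalidate"
```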