crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.
Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/vision_language/gqa_scenario.py
@@ -0,0 +1,91 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class GQAScenario(Scenario):
+    """
+    Questions about real-world visual reasoning and compositional QA
+
+    @misc{hudson2019gqa,
+        title={GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
+        author={Drew A. Hudson and Christopher D. Manning},
+        year={2019},
+        eprint={1902.09506},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL}
+    }
+
+    Paper: https://arxiv.org/abs/1902.09506
+    Website: https://github.com/stanford-crfm/helm/issues/1951
+    """
+
+    QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
+    IMAGES_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"
+
+    name = "gqa"
+    description = (
+        "Questions about real-world visual reasoning and compositional QA "
+        "([paper](https://arxiv.org/abs/1902.09506))."
+    )
+    tags = ["vision-language", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        questions_path: str = os.path.join(output_path, "questions")
+        ensure_file_downloaded(
+            source_url=self.QUESTIONS_URL, target_path=questions_path, unpack=True, unpack_type="unzip"
+        )
+
+        images_path: str = os.path.join(output_path, "images")
+        ensure_file_downloaded(source_url=self.IMAGES_URL, target_path=images_path, unpack=True, unpack_type="unzip")
+
+        instances: List[Instance] = []
+        for helm_split in ALL_SPLITS:
+            if helm_split == TEST_SPLIT:
+                # The test split doesn't have annotations
+                continue
+
+            split: str = "val" if helm_split == VALID_SPLIT else helm_split
+
+            # Read the questions from the JSON
+            questions_split_path: str = os.path.join(questions_path, f"{split}_balanced_questions.json")
+            with open(questions_split_path, "r") as questions_file:
+                questions: Dict[str, Any] = json.load(questions_file)
+                for question_id, question_data in questions.items():
+                    question: str = question_data["question"]
+                    short_answer: str = question_data["answer"]
+                    full_answer: str = question_data["fullAnswer"]
+
+                    image_id: str = question_data["imageId"]
+                    local_image_path: str = os.path.join(images_path, f"{image_id}.jpg")
+
+                    content: List[MediaObject] = [
+                        MediaObject(text=question, content_type="text/plain"),
+                        MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    ]
+                    instances.append(
+                        Instance(
+                            Input(multimedia_content=MultimediaObject(content)),
+                            references=[
+                                Reference(Output(text=short_answer), tags=[CORRECT_TAG]),
+                                Reference(Output(text=full_answer), tags=[CORRECT_TAG]),
+                            ],
+                            split=helm_split,
+                        )
+                    )
+
+        return instances
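A quick orientation, not part of the diff: the new scenario can be exercised on its own. The module path comes from the file list above; the output directory is only an example.

    from helm.benchmark.scenarios.vision_language.gqa_scenario import GQAScenario

    # Downloads the GQA questions and images into the example directory, then builds
    # one Instance per balanced question for the train and validation splits.
    scenario = GQAScenario()
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/gqa")  # example path

    # Each instance carries both the short and the full answer as correct references.
    print(len(instances), [reference.output.text for reference in instances[0].references])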

helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py
@@ -80,11 +80,13 @@ class HatefulMemesScenario(Scenario):
                 MediaObject(location=local_image_path, content_type="image/jpeg"),
                 MediaObject(text=self.QUESTION, content_type="text/plain"),
             ]
-            answer: str = "Yes" if row["label"] == 1 else "No"
             instances.append(
                 Instance(
                     Input(multimedia_content=MultimediaObject(content)),
-                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    references=[
+                        Reference(Output(text="Yes"), tags=[CORRECT_TAG] if row["label"] == 1 else []),
+                        Reference(Output(text="No"), tags=[CORRECT_TAG] if row["label"] == 0 else []),
+                    ],
                     split=split,
                 )
             )

helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py
@@ -22,6 +22,10 @@ from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 
 PROCESSED: str = "processed"
+DIFFICULTY_ALL = "all"
+DIFFICULTY_EASY = "easy"
+DIFFICULTY_MEDIUM = "medium"
+DIFFICULTY_HARD = "hard"
 
 
 class Image2StructureScenario(Scenario):
@@ -38,13 +42,16 @@ class Image2StructureScenario(Scenario):
         VALID_SPLIT: "validation",
     }
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+    def __init__(
+        self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT, difficulty: str = DIFFICULTY_ALL
+    ):
         super().__init__()
         assert subset in self.SUBSETS, f"Invalid subset: {subset}"
         self._subset: str = subset
         self._recompile_prompt: bool = recompile_prompt
         self._split: str = split
         self._output_path: Optional[str] = None
+        self._difficulty: str = difficulty
 
     def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
         # By default, there are no assets
@@ -110,6 +117,10 @@
                )
                continue

+            # Filter by difficulty
+            if self._difficulty != DIFFICULTY_ALL and row["difficulty"] != self._difficulty:
+                continue
+
             # Step 1: Preprocess the row
             row = self.preprocess_row(row, assets_path)
 
@@ -158,7 +169,7 @@
                # representing the structure (such as LaTeX code)
                multimedia_object = MultimediaObject([image_object])
                reference = Reference(
-                    output=Output(text=row["text"], multimedia_content=multimedia_object),
+                    output=Output(text=row["text"] if "text" in row else "", multimedia_content=multimedia_object),
                    tags=[CORRECT_TAG],
                )
            else:
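Taken together, these hunks thread a new difficulty argument from the constructor down to row filtering. A minimal sketch of how a subclass might opt into it (illustrative only; LatexScenario appears in the next hunk, and the argument values are just examples):

    from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_HARD
    from helm.benchmark.scenarios.vision_language.image2structure.latex_scenario import LatexScenario

    # Rows whose "difficulty" field does not match are skipped; DIFFICULTY_ALL (the default) keeps everything.
    scenario = LatexScenario(subset="equation", recompile_prompt=False, difficulty=DIFFICULTY_HARD)
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/image2latex")  # example path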

helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py
@@ -1,4 +1,3 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
     latex_to_image,
     strip_unnecessary_latex_parts,
@@ -9,14 +8,11 @@ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_sc
 class LatexScenario(Image2StructureScenario):
     BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
-    SUBSETS = ["equation", "table", "plot", "algorithm"]
+    SUBSETS = ["equation", "table", "plot", "algorithm", "real"]
 
     name = "image2latex"
     description = "Evaluate multimodal models on Latex generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
         image.save(destination_path)

helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py
@@ -1,10 +1,9 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
 
 
 class MusicSheetScenario(Image2StructureScenario):
     BASE_PROMPT = (
-        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasible possible.\n"  # noqa: E501
+        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasibly possible.\n"  # noqa: E501
         "This music sheet was created by me, and I would like to recreate it using Lilypond."
     )
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
@@ -13,8 +12,5 @@ class MusicSheetScenario(Image2StructureScenario):
     name = "image2musicsheet"
     description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         raise Exception("Music sheets have no ground truth, compilation is not possible")

helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py
@@ -4,6 +4,7 @@ from helm.benchmark.scenarios.scenario import VALID_SPLIT
 from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
     Image2StructureScenario,
     PROCESSED,
+    DIFFICULTY_ALL,
 )
 from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
 from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
@@ -123,12 +124,12 @@ class WebpageScenario(Image2StructureScenario):
         " }\n"
         "]\n"
         "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
-        " they will be created for you automatically. Try to write some realistic code keeping in mind that is should"
+        " they will be created for you automatically. Try to write some realistic code keeping in mind that it should"
         " look like the image as much as feasibly possible."
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript"]
+    SUBSETS = ["css", "html", "javascript", "real"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -140,9 +141,10 @@
         subset: str,
         recompile_prompt: bool = True,
         split: str = VALID_SPLIT,
+        difficulty: str = DIFFICULTY_ALL,
         screenshot_options: ScreenshotOptions = ScreenshotOptions(),
     ):
-        super().__init__(subset, recompile_prompt, split)
+        super().__init__(subset, recompile_prompt, split, difficulty)
         self._screenshot_options = screenshot_options
         self._html2text = HTML2Text()
         self._html2text.ignore_links = True

helm/benchmark/scenarios/vision_language/math_vista_scenario.py
@@ -0,0 +1,117 @@
+import os
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MathVistaScenario(Scenario):
+    """
+    MathVista: Evaluating Math Reasoning in Visual Contexts
+
+    To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse
+    mathematical and visual tasks. It consists of 6,141 examples, derived from 28 existing multimodal datasets
+    involving mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and PaperQA). Completing these
+    tasks requires fine-grained, deep visual understanding and compositional reasoning, which all state-of-the-art
+    foundation models find challenging.
+
+    @inproceedings{lu2024mathvista,
+        author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi,
+            Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
+        title = {MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts},
+        booktitle={International Conference on Learning Representations (ICLR)},
+        year = {2024}
+    }
+
+    Paper: https://arxiv.org/abs/2310.02255
+    Website: https://mathvista.github.io/
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "AI4Math/MathVista"
+
+    # Only the testmini split has answers
+    SPLIT: str = "testmini"
+
+    # Supported difficulties
+    GRADES: List[str] = ["elementary_school", "high_school", "college", "daily_life"]
+    QUESTION_TYPES: List[str] = ["multi_choice", "free_form"]
+
+    name = "math_vista"
+    description = (
+        "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
+        "([paper](https://arxiv.org/abs/2310.02255))."
+    )
+    tags = ["vision-language", "reasoning", "math"]
+
+    def __init__(self, grade: str, question_type: str):
+        super().__init__()
+        assert grade in self.GRADES, f"Not supported: {grade}"
+        self._grade: str = grade.replace("_", " ")
+
+        assert question_type in self.QUESTION_TYPES, f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(os.path.join(output_path, "images"))
+        instances: List[Instance] = []
+
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=self.SPLIT, cache_dir=output_path)):
+            # Filter out the questions by type and grade (or difficulty)
+            if row["question_type"] != self._question_type or row["metadata"]["grade"] != self._grade:
+                continue
+
+            pid: str = row["pid"]
+            question: str = row["question"]
+            answer: str = row["answer"]
+
+            # Save the image locally
+            assert row["image"] == f"images/{pid}.jpg", f"Invalid image path: {row['image']} for question {pid}"
+            image_path: str = os.path.join(output_path, row["image"])
+
+            if not os.path.exists(image_path):
+                image = row["decoded_image"]
+                if image.mode in ("RGBA", "P", "LA"):
+                    image = image.convert("RGB")
+                image.save(image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(text=question, content_type="text/plain"),
+                MediaObject(location=image_path, content_type="image/jpeg"),
+            ]
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multi_choice":
+                options: List[str] = row["choices"]
+                for option in options:
+                    references.append(Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else []))
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))
+
+                if row["unit"] is not None:
+                    references.append(Reference(Output(text=f"{answer} {row['unit']}"), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        assert (
+            len(instances) > 0
+        ), f"No instances found for subject {self._grade} and question type {self._question_type}"
+        return instances
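A short usage sketch, not part of the diff; the grade and question_type values are picked from the GRADES and QUESTION_TYPES lists above, and the output path is arbitrary.

    from helm.benchmark.scenarios.vision_language.math_vista_scenario import MathVistaScenario

    # Anything outside GRADES / QUESTION_TYPES trips the constructor asserts.
    scenario = MathVistaScenario(grade="high_school", question_type="multi_choice")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/math_vista")  # example path

    # Multiple-choice rows yield one reference per option, with CORRECT_TAG only on the answer.
    for reference in instances[0].references:
        print(reference.output.text, reference.tags)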

helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
@@ -0,0 +1,103 @@
+import os
+import json
+from typing import Any, Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MMSafetyBenchScenario(Scenario):
+    """
+    To evaluate the extent of this vulnerability in open-source VLMs, compiled a substantial dataset encompassing
+    13 scenarios with a total of 5,040 text-image pairs
+
+    @misc{liu2023queryrelevant,
+        title={Query-Relevant Images Jailbreak Large Multi-Modal Models},
+        author={Xin Liu and Yichen Zhu and Yunshi Lan and Chao Yang and Yu Qiao},
+        year={2023},
+        eprint={2311.17600},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2311.17600
+    Website: https://isxinliu.github.io/Project/MM-SafetyBench/
+    Questions: https://github.com/isXinLiu/MM-SafetyBench/tree/main/data/processed_questions
+    """
+
+    SUBSET_TO_DATASET_FOLDER: Dict[str, str] = {
+        "illegal_activity": "01-Illegal_Activitiy",  # This is intentionally misspelled to match the original name
+        "hate_speech": "02-HateSpeech",
+        "malware_generation": "03-Malware_Generation",
+        "physical_harm": "04-Physical_Harm",
+        "economic_harm": "05-EconomicHarm",
+        "fraud": "06-Fraud",
+        "sex": "07-Sex",
+        "political_lobbying": "08-Political_Lobbying",
+        "privacy_violence": "09-Privacy_Violence",
+        "legal_opinion": "10-Legal_Opinion",
+        "financial_advice": "11-Financial_Advice",
+        "health_consultation": "12-Health_Consultation",
+        "government_decision": "13-Gov_Decision",
+    }
+
+    QUESTIONS_URL_TEMPLATE: str = (
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/" "processed_questions/{dataset}.json"
+    )
+    IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
+
+    name = "mm_safety_bench"
+    description = (
+        "Expose the vulnerability of open-source VLMs with toxic and biased content "
+        "([paper](https://arxiv.org/abs/2311.17600))."
+    )
+    tags = ["vision-language", "bias", "toxicity"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        assert subset in self.SUBSET_TO_DATASET_FOLDER, f"Invalid subset: {subset}"
+        self._dataset: str = self.SUBSET_TO_DATASET_FOLDER[subset]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download all the images
+        images_path: str = os.path.join(output_path, "MM-SafetyBench(imgs)")
+        assert os.path.exists(images_path), (
+            f"Images path does not exist: {images_path}. Download the images "
+            f"from {self.IMAGES_URL}, unzip and place it at {output_path}"
+        )
+        # SD_TYPO seems to have the greatest attack success rate on the models they evaluated
+        images_path = os.path.join(images_path, self._dataset, "SD_TYPO")
+        assert os.path.exists(images_path)
+
+        questions_path: str = os.path.join(output_path, f"{self._dataset}.json")
+        questions_url: str = self.QUESTIONS_URL_TEMPLATE.format(dataset=self._dataset)
+        ensure_file_downloaded(source_url=questions_url, target_path=questions_path)
+
+        instances: List[Instance] = []
+
+        with open(questions_path, "r") as questions_file:
+            questions: Dict[str, Any] = json.load(questions_file)
+            for question_id, question_data in questions.items():
+                local_image_path: str = os.path.join(images_path, f"{question_id}.jpg")
+                assert os.path.exists(local_image_path), f"Image does not exist: {local_image_path}"
+
+                question: str = question_data["Rephrased Question"]
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=question, content_type="text/plain"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[],
+                        split=TEST_SPLIT,
+                    )
+                )
+
+        return instances
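One operational note worth calling out, with a sketch (not part of the diff; the paths are examples): unlike the other new scenarios, the images are not downloaded automatically.

    from helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario import MMSafetyBenchScenario

    # The zip behind IMAGES_URL must be fetched and unpacked into the output directory
    # as "MM-SafetyBench(imgs)" beforehand, otherwise the assert in get_instances fires.
    scenario = MMSafetyBenchScenario(subset="illegal_activity")
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/mm_safety_bench")  # example path

    # Instances have no references: the prompt is the SD_TYPO image plus the rephrased question.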

helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py
@@ -0,0 +1,92 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCaptioningScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2014 version of the dataset instead
+    of the 2017 version because of the larger validation set. According to https://cocodataset.org/#download,
+    the 2014 version has 83K images in the train split and 41K in the val split.
+
+    Each image also has five captions. For example, image #335111 has the following five captions:
+    1. a row of bikes on the sidewalk, 2 on the ground.
+    2. a couple of bikes laying on their sides on a sidewalk.
+    3. a person wearing a black coat with a hood stands on the street, near many bikes
+    4. a woman standing in front of a row of bicycles in front of a bus stop with two bikes knocked over
+    5. there are some bicycles laying on their sides
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2014.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"captions_{coco_split}2014.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_captions[annotation["image_id"]].append(annotation["caption"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                captions: List[str] = image_id_to_captions[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=caption.rstrip()), tags=[CORRECT_TAG]) for caption in captions
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances

helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py
@@ -0,0 +1,117 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Set
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCategorizationScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2017 version of the dataset
+    for the categorization task.
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2017.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data_2017")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        super_categories_to_categories: Dict[str, List[str]] = defaultdict(list)
+        category_id_to_category: Dict[int, str] = {}
+        category_id_to_super_category: Dict[int, str] = {}
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"stuff_{coco_split}2017.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            for category_metadata in metadata["categories"]:
+                # Each metadata looks like this {'supercategory': 'textile', 'id': 92, 'name': 'banner'}
+                category_id: int = category_metadata["id"]
+                category: str = category_metadata["name"]
+                super_category: str = category_metadata["supercategory"]
+                super_categories_to_categories[super_category].append(category)
+                category_id_to_category[category_id] = category
+                category_id_to_super_category[category_id] = super_category
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_category_ids: Dict[int, List[int]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_category_ids[annotation["image_id"]].append(annotation["category_id"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                assert os.path.exists(image_path), f"Image path {image_path} does not exist"
+                category_ids: List[int] = image_id_to_category_ids[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                references: List[Reference] = []
+                correct_super_categories: Set[str] = set(
+                    category_id_to_super_category[category_id] for category_id in category_ids
+                )
+                # for category_id in category_ids:
+                #     category = category_id_to_category[category_id]
+                #     super_category = category_id_to_super_category[category_id]
+                #     references.extend(
+                #         [
+                #             Reference(Output(text=category), tags=[CORRECT_TAG]),
+                #             Reference(Output(text=super_category), tags=[CORRECT_TAG]),
+                #         ]
+                #     )
+                for super_category in super_categories_to_categories:
+                    references.append(
+                        Reference(
+                            Output(text=super_category),
+                            tags=[CORRECT_TAG] if super_category in correct_super_categories else [],
+                        )
+                    )
+
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=references,
+                        split=helm_split,
+                    )
+                )
+
+        return instances
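A closing sketch, not part of the diff, of how the categorization references behave; the output path is an example.

    from helm.benchmark.scenarios.scenario import CORRECT_TAG
    from helm.benchmark.scenarios.vision_language.mscoco_categorization_scenario import MSCOCOCategorizationScenario

    scenario = MSCOCOCategorizationScenario()
    instances = scenario.get_instances(output_path="benchmark_output/scenarios/mscoco")  # example path

    # Every supercategory appears as a reference; only those present in the image's
    # stuff annotations carry CORRECT_TAG, so the task reads as multi-label classification.
    correct = [r.output.text for r in instances[0].references if CORRECT_TAG in r.tags]
    print(correct)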