PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (499) hide show

helm/benchmark/scenarios/vision_language/unicorn_scenario.py ADDED Viewed

@@ -0,0 +1,108 @@
+import os.path
+from typing import Dict, List
+from datasets import load_dataset
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+class UnicornScenario(Scenario):
+    """
+    How Many Unicorns are in this Image? A Safety Evaluation Benchmark of Vision LLMs
+    We shift our focus from evaluating standard performance to introducing a comprehensive safety evaluation
+    suite Unicorn, covering both out-of-distribution (OOD) generalization and adversarial robustness. For the OOD
+    evaluation, we present two novel VQA datasets --- OODCV-VQA and Sketchy-VQA, each with one variant, designed
+    to test model performance under challenging conditions. In the OOD scenario, questions are matched with
+    boolean or numerical answers, and we use exact match metrics for evaluation. When comparing OOD Sketchy-VQA
+    with its synthesized in-distribution counterpart, we found an average model output F1 drop of 8.9%,
+    highlighting the challenging nature of the OOD scenario in the Unicorn benchmark.
+    @article{tu2023unicorns,
+    title={How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs},
+    author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han,
+    Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
+    journal={arXiv preprint arXiv:2311.16101},
+    year={2023}
+    }
+    Paper: https://arxiv.org/abs/2311.16101
+    """
+    UNICORN_HUGGINGFACE_DATASET_NAME: str = "PahaII/unicorn"
+    IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"
+    SUBJECTS: List[str] = ["OODCV-VQA", "OODCV-Counterfactual", "Sketchy-VQA", "Sketchy-Challenging"]
+    IMG_TYPE: Dict[str, str] = {
+        "OODCV-VQA": "jpeg",
+        "OODCV-Counterfactual": "jpeg",
+        "Sketchy-VQA": "png",
+        "Sketchy-Challenging": "png",
+    }
+    name = "unicorn"
+    description = (
+        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
+        " ([paper](https://arxiv.org/abs/2311.16101))."
+    )
+    tags = ["vision-language"]
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+        self._image_type: str = self.IMG_TYPE[self._subject]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+        # There is only the test split in Unicorn benchmark
+        instances: List[Instance] = []
+        question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+        # Process the test set
+        for row in tqdm(
+            load_dataset(
+                self.UNICORN_HUGGINGFACE_DATASET_NAME,
+                data_files=question_data_files,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            # Download the image
+            image_path: str = row["image_path"]
+            local_image_path: str = os.path.join(output_path, image_path)
+            ensure_file_downloaded(
+                source_url=self.IMAGE_URL.format(image_path=image_path),
+                target_path=local_image_path,
+                unpack=False,
+            )
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type=f"image/{self._image_type}"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            answer: str = row["answer"]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+        return instances

helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py CHANGED Viewed

@@ -13,13 +13,13 @@ from helm.benchmark.scenarios.scenario import (
     Scenario,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.common.general import ensure_file_downloaded
 class VizWizScenario(Scenario):
     """
-    VizWiz is a real-world visual question answering dataset consisting of questions
-    asked by people who are blind. It originates from a natural visual question answering
+    VizWiz is a real-world visual question answering dataset consisting of questions asked by people who are
+    visually impaired. It originates from a natural visual question answering
     setting where blind people each took an image and recorded a spoken question about it,
     together with 10 crowdsourced answers per visual question.
@@ -60,7 +60,6 @@ class VizWizScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the questions and annotations
         annotations_path: str = os.path.join(output_path, "annotations")
-        ensure_directory_exists(annotations_path)
         ensure_file_downloaded(
             source_url=self.ANNOTATIONS_URL,
             target_path=annotations_path,

helm/benchmark/scenarios/vision_language/vqa_scenario.py CHANGED Viewed

@@ -54,7 +54,7 @@ class VQAScenario(Scenario):
         TEST_SPLIT: "http://images.cocodataset.org/zips/test2015.zip",
     }
-    name = "visual_question_answering"
+    name = "vqa"
     description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
     tags = ["vision-language", "visual question answering"]
@@ -111,11 +111,13 @@ class VQAScenario(Scenario):
                     MediaObject(location=image_path, content_type="image/jpeg"),
                     MediaObject(text=question_json["question"], content_type="text/plain"),
                 ]
                 instances.append(
                     Instance(
                         Input(multimedia_content=MultimediaObject(content)),
-                        references=[Reference(Output(text=answers_json["multiple_choice_answer"]), tags=[CORRECT_TAG])],
+                        references=[
+                            Reference(Output(text=answer_json["answer"]), tags=[CORRECT_TAG])
+                            for answer_json in answers_json["answers"]
+                        ],
                         split=split,
                     )
                 )

helm/benchmark/scenarios/wmt_14_scenario.py CHANGED Viewed

@@ -61,7 +61,7 @@ class WMT14Scenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
             subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
-            hf_dataset: Any = load_dataset("wmt14", subset_name)
+            hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
             splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
         instances: List[Instance] = []

helm/benchmark/server.py CHANGED Viewed

@@ -70,6 +70,14 @@ def serve_benchmark_output(filename):
     return response
+@app.get("/cache/output/<filename:path>")
+def serve_cache_output(filename):
+    response = static_file(filename, root=app.config["helm.cacheoutputpath"])
+    response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
+    response.set_header("Expires", "0")
+    return response
 @app.get("/")
 @app.get("/<filename:path>")
 def serve_static(filename="index.html"):
@@ -87,6 +95,12 @@ def main():
         help="The location of the output path (filesystem path or URL)",
         default="benchmark_output",
     )
+    parser.add_argument(
+        "--cache-output-path",
+        type=str,
+        help="The location of the filesystem cache output folder (filesystem path or URL)",
+        default="prod_env/cache/output",
+    )
     parser.add_argument(
         "--suite",
         type=str,
@@ -99,6 +113,11 @@ def main():
         default=None,
         help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
     )
+    parser.add_argument(
+        "--jquery",
+        action="store_true",
+        help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
+    )
     args = parser.parse_args()
     if args.suite and args.release:
@@ -107,7 +126,8 @@ def main():
     # Determine the location of the static directory.
     # This is a hack: it assumes that the static directory has a physical location,
     # which is not always the case (e.g. when using zipimport).
-    resource_path = resources.files("helm.benchmark.static").joinpath("index.html")
+    static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
+    resource_path = resources.files(static_package_name).joinpath("index.html")
     with resources.as_file(resource_path) as resource_filename:
         static_path = str(resource_filename.parent)
@@ -117,16 +137,19 @@ def main():
         # Output path is a URL, so set the output path base URL in the frontend to that URL
         # so that the frontend reads from that URL directly.
         app.config["helm.outputpath"] = None
+        # TODO: figure out helm.cacheoutputpath
         app.config["helm.outputurl"] = args.output_path
     else:
         # Output path is a location on disk, so set the output path base URL to /benchmark_output
         # and then serve files from the location on disk at that URL.
         app.config["helm.outputpath"] = path.abspath(args.output_path)
+        app.config["helm.cacheoutputpath"] = path.abspath(args.cache_output_path)
         app.config["helm.outputurl"] = "benchmark_output"
     app.config["helm.suite"] = args.suite or "latest"
     app.config["helm.release"] = args.release
+    print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
     app.run(host="0.0.0.0", port=args.port)

helm/benchmark/slurm_runner.py CHANGED Viewed

@@ -10,6 +10,10 @@ import sys
 from helm.common.codec import from_json, to_json
 from helm.common.general import write
+from helm.benchmark.config_registry import (
+    register_configs_from_directory,
+    register_builtin_configs_from_helm_package,
+)
 from helm.benchmark.executor import ExecutionSpec
 from helm.benchmark.runner import Runner, RunSpec, RunnerError
 from helm.benchmark.slurm_jobs import (
@@ -24,10 +28,11 @@ from helm.benchmark.slurm_jobs import (
 from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.benchmark.runner_config_registry import RUNNER_CONFIG
-_DEFAULT_MAX_CONCURRENT_WORKER_SLURM_JOBS = 8
 _MAX_CONCURRENT_WORKER_SLURM_JOBS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKER_SLURM_JOBS"
 _SLURM_NODE_NAMES_ENV_NAME = "HELM_SLURM_NODE_NAMES"
+_DEFAULT_MAX_CONCURRENT_WORKER_SLURM = 8
 @dataclass
@@ -89,12 +94,15 @@ class SlurmRunner(Runner):
         self.slurm_runner_spec_path = os.path.join(self.slurm_base_dir, "slurm_runner_spec.json")
         # Configure max concurrent worker Slurm jobs from the environment variable.
-        # TODO: Read from a configuration file instead
         env_max_concurrent_worker_slurm_jobs = os.getenv(_MAX_CONCURRENT_WORKER_SLURM_JOBS_ENV_NAME)
         self.max_concurrent_worker_slurm_jobs = (
             int(env_max_concurrent_worker_slurm_jobs)
             if env_max_concurrent_worker_slurm_jobs
-            else _DEFAULT_MAX_CONCURRENT_WORKER_SLURM_JOBS
+            else (
+                RUNNER_CONFIG.helm_max_concurrent_workers
+                if RUNNER_CONFIG.helm_max_concurrent_workers > 0
+                else _DEFAULT_MAX_CONCURRENT_WORKER_SLURM
+            )
         )
     def run_all(self, run_specs: List[RunSpec]):
@@ -222,8 +230,7 @@ class SlurmRunner(Runner):
                         break
                     # Refresh every minute
-                    # TODO: Make this period configurable
-                    time.sleep(60)
+                    time.sleep(RUNNER_CONFIG.slurm_monitor_interval)
         finally:
             # Cleanup by cancelling all jobs during program termination or if an exception is raised.
             cancel_all_jobs()
@@ -261,34 +268,48 @@ class SlurmRunner(Runner):
                 run_spec_path,
             ]
         )
-        # TODO: Make default Slurm arguments configurable.
-        raw_slurm_args: Dict[str, str] = {
-            "account": "nlp",
-            "cpus_per_task": "4",
-            "mem": "32G",
-            "gres": "gpu:0",
-            "open_mode": "append",
-            "partition": "john",
-            "time": "14-0",  # Deadline of 14 days
-            "mail_type": "FAIL",
-            "job_name": run_name,
-            "output": log_path,
-            "chdir": os.getcwd(),
-        }
-        # TODO: Move resource requirements into RunSpec.
-        slurm_node_names = os.getenv(_SLURM_NODE_NAMES_ENV_NAME)
-        if run_spec.name.startswith("msmarco:"):
-            raw_slurm_args["mem"] = "64G"
-        if "device=cuda" in run_spec.name:
-            raw_slurm_args["gres"] = "gpu:1"
-            raw_slurm_args["partition"] = "jag-hi"
-        if "model=huggingface" in run_spec.name:
-            raw_slurm_args["gres"] = "gpu:1"
-            raw_slurm_args["partition"] = "sphinx"
-            if not slurm_node_names or "sphinx" not in slurm_node_names:
-                raise Exception(f"Environment variable {_SLURM_NODE_NAMES_ENV_NAME} must be set to sphinx node names")
-        if slurm_node_names:
-            raw_slurm_args["nodelist"] = slurm_node_names
+        if RUNNER_CONFIG.slurm_args is None:
+            raw_slurm_args: Dict[str, str] = {
+                "account": "nlp",
+                "cpus_per_task": "4",
+                "mem": "32G",
+                "gres": "gpu:0",
+                "open_mode": "append",
+                "partition": "john",
+                "time": "14-0",  # Deadline of 14 days
+                "mail_type": "FAIL",
+                "job_name": run_name,
+                "output": log_path,
+                "chdir": os.getcwd(),
+            }
+            # TODO: Move resource requirements into RunSpec.
+            slurm_node_names = os.getenv(_SLURM_NODE_NAMES_ENV_NAME)
+            if run_spec.name.startswith("msmarco:"):
+                raw_slurm_args["mem"] = "64G"
+            if "device=cuda" in run_spec.name:
+                raw_slurm_args["gres"] = "gpu:1"
+                raw_slurm_args["partition"] = "jag-hi"
+            if "model=huggingface" in run_spec.name:
+                raw_slurm_args["gres"] = "gpu:1"
+                raw_slurm_args["partition"] = "sphinx"
+                if not slurm_node_names or "sphinx" not in slurm_node_names:
+                    raise Exception(
+                        f"Environment variable {_SLURM_NODE_NAMES_ENV_NAME} must be set to sphinx node names"
+                    )
+            if slurm_node_names:
+                raw_slurm_args["nodelist"] = slurm_node_names
+        else:
+            raw_slurm_args = RUNNER_CONFIG.slurm_args
+            dynamic_slurm_args = {
+                "job_name": run_name,
+                "output": log_path,
+                "chdir": os.getcwd(),
+            }
+            # User should not set these manually, overwrite them if necessary
+            raw_slurm_args.update(dynamic_slurm_args)
         slurm_args: Dict[str, str] = {key: shlex.quote(value) for key, value in raw_slurm_args.items()}
         # Uncomment this to get notification emails from Slurm for Slurm worker jobs.
@@ -300,27 +321,15 @@ class SlurmRunner(Runner):
         return slurm_job_id
-def run_as_worker(slurm_runner_spec_path: str, run_spec_path: str):
-    """Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
-    Used by the worker Slurm jobs only."""
-    with open(slurm_runner_spec_path, "r") as f:
-        slurm_runner_spec = from_json(f.read(), SlurmRunnerSpec)
-    with open(run_spec_path, "r") as f:
-        run_spec = from_json(f.read(), RunSpec)
-    slurm_runner = SlurmRunner(**slurm_runner_spec.to_kwargs())
-    slurm_runner.run_one(run_spec)
 def main():
     """Entry point for the SlurmRunner's worker Slurm jobs that run a single RunSpec.
     This entry point should only be used by SlurmRunner. Users should use `helm-run` instead.
     SlurmRunner has to use this entry point instead of helm-run because there is no way to
     specify the worker Slurm job parameters through `helm-run`. In particular, there is no way
-    to run a specific `RunSpec` using the `--run-specs` parameter of `helm-run`, because the
-    `run-specs` argument is a `RunSpec` description (not a `RunSpec`), and there is no way to
-    convert a `RunSpec` into a `RunSpec` description."""
+    to run a specific `RunSpec` using the `--run-entries` parameter of `helm-run`, because the
+    `run-entries` argument contains `RunEntry` description (not `RunSpec`s), and there is no way to
+    convert a `RunSpec` into a `RunEntry` description."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--slurm-runner-spec-path",
@@ -335,7 +344,19 @@ def main():
         required=True,
     )
     args = parser.parse_args()
-    run_as_worker(slurm_runner_spec_path=args.slurm_runner_spec_path, run_spec_path=args.run_spec_path)
+    # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
+    with open(args.slurm_runner_spec_path, "r") as f:
+        slurm_runner_spec = from_json(f.read(), SlurmRunnerSpec)
+    with open(args.run_spec_path, "r") as f:
+        run_spec = from_json(f.read(), RunSpec)
+    register_builtin_configs_from_helm_package()
+    if slurm_runner_spec.execution_spec.local_path is not None:
+        register_configs_from_directory(slurm_runner_spec.execution_spec.local_path)
+    slurm_runner = SlurmRunner(**slurm_runner_spec.to_kwargs())
+    slurm_runner.run_one(run_spec)
 if __name__ == "__main__":

helm/benchmark/static/benchmarking.js CHANGED Viewed

@@ -492,7 +492,7 @@ $(function () {
               {{~#if perturbation~}}
                 {{highlightNewWords input.text ../unperturbedInstance.input.text}}
               {{~else~}}
-                {{input.text}}
+                {{{input.text}}}
               {{~/if~}}
             </div>
           {{/if}}

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl