PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (580) hide show

helm/benchmark/scenarios/vision_language/mm_star_scenario.py ADDED Viewed

@@ -0,0 +1,95 @@
+from typing import List
+import os
+from datasets import load_dataset
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+class MMStarScenario(Scenario):
+    """
+    MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
+    selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
+    the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
+    are first roughly selected from current benchmarks with an automated pipeline, strict human review is then
+    involved to ensure each selected sample exhibits visual dependency, minimal data leakage, and requires advanced
+    multi-modal capabilities for the solution.
+    Website: https://mmstar-benchmark.github.io/
+    @article{chen2024we,
+      title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
+      author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
+      Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
+      journal={arXiv preprint arXiv:2403.20330},
+      year={2024}
+    }
+    """
+    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"
+    # Categories accepted by the constructor, spelled with spaces (underscores in the argument are
+    # converted to spaces before the lookup).
+    VALID_CATEGORIES: List[str] = [
+        "coarse perception",
+        "fine-grained perception",
+        "instance reasoning",
+        "logical reasoning",
+        "math",
+        "science technology",
+    ]
+    name = "mm_star"
+    # Adjacent literals are concatenated, so each fragment must carry its own separating space.
+    description = (
+        "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
+        "meticulously selected by humans. "
+        "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+    def __init__(self, category: str):
+        """
+        Select the MMStar category to evaluate.
+        `category` may use underscores in place of spaces (e.g. "logical_reasoning").
+        Raises ValueError if the normalized category is not in VALID_CATEGORIES.
+        """
+        super().__init__()
+        category = category.replace("_", " ")
+        if category not in self.VALID_CATEGORIES:
+            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
+        # Rows are filtered by comparing against this value, so use the "&" spelling for this category.
+        if category == "science technology":
+            category = "science & technology"
+        self._category: str = category
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Load the "val" split, keep the rows of the selected category and build one instance per row.
+        Each row's image is written under `output_path` (skipped if the file already exists) and paired
+        with the question text; the row's answer is the single correct reference.
+        """
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)):
+            # Filter by category
+            category: str = row["category"]
+            if category != self._category:
+                continue
+            # Save the image to disk; the file name comes from generate_hash(image), so a file that is
+            # already present is reused instead of being rewritten.
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
+            )
+        return instances

helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py ADDED Viewed

@@ -0,0 +1,88 @@
+import os
+from typing import List
+from datasets import DatasetDict, load_dataset
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.general import ensure_directory_exists
+from helm.common.media_object import MediaObject, MultimediaObject
+class VQARadScenario(Scenario):
+    """
+    VQARad scenario: Processes a visual question answering dataset with radiology images.
+    Each record in the dataset has:
+    - image
+    - question
+    - answer
+    Each instance pairs the image (saved to disk as a .jpg file) with the question text; the answer is
+    used verbatim, without any prefix, as the single correct reference.
+    """
+    HUGGING_FACE_DATASET_PATH: str = "flaviagiammarino/vqa-rad"
+    name = "vqa_rad"
+    description = "Visual question answering with radiology images."
+    tags = [
+        "vision-language",
+        "visual question answering",
+        "reasoning",
+        "medical",
+        "radiology",
+    ]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Build instances for the train and test splits of the dataset.
+        Images are written to `<output_path>/<split>/<index>.jpg`; a file that already exists is reused.
+        """
+        dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH)
+        # Maps the HELM split constant to the split name used by the dataset.
+        splits = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"}
+        instances: List[Instance] = []
+        # Iterate over the splits
+        for (
+            helm_split_name,
+            dataset_split_name,
+        ) in splits.items():
+            split_path: str = os.path.join(output_path, dataset_split_name)
+            ensure_directory_exists(split_path)
+            split_data = dataset[dataset_split_name]
+            for index, example in enumerate(split_data):
+                question = example["question"]
+                image = example["image"]
+                answer = example["answer"]
+                # Save the image to disk so it can be referenced by path; skip the write when a previous
+                # run already produced the file.
+                # NOTE(review): files are keyed by row index, so this assumes the dataset row order is
+                # stable between runs - clear the directory if the upstream dataset changes.
+                image_path = os.path.join(split_path, f"{index}.jpg")
+                if not os.path.exists(image_path):
+                    image.save(image_path)
+                content = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                    MediaObject(text=question, content_type="text/plain"),
+                ]
+                # The raw answer is the only (correct) reference.
+                instances.append(
+                    Instance(
+                        input=Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(
+                                Output(text=answer),
+                                tags=[CORRECT_TAG],
+                            )
+                        ],
+                        split=helm_split_name,
+                    )
+                )
+        return instances

helm/benchmark/scenarios/wikifact_scenario.py CHANGED Viewed

@@ -4,7 +4,17 @@ import json
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 PID_TO_NAME = {
     "P136": "genre",

helm/benchmark/scenarios/wikitext_103_scenario.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import List
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 class Wikitext103Scenario(Scenario):

helm/benchmark/scenarios/wildbench_scenario.py ADDED Viewed

@@ -0,0 +1,83 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+from helm.common.general import ensure_directory_exists
+# Subset names accepted by WildBenchScenario; its constructor asserts membership in this list.
+SUBSETS = ["v2"]
+# Models whose pre-generated outputs are loaded as baselines when use_model_outputs is enabled.
+REFERENCE_MODELS = ["gpt-4-turbo-2024-04-09", "claude-3-haiku-20240307", "Llama-2-70b-chat-hf"]
+class WildBenchScenario(Scenario):
+    """WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild
+    WildBench is a benchmark for evaluating large language models (LLMs) on challenging tasks
+    that are more representative of real-world applications. The examples are collected from
+    real users by the AI2 WildChat project."""
+    name = "wildbench"
+    description = "Benchmarking LLMs with Challenging Tasks from Real Users in the Wild"
+    tags = ["instruction following"]
+    def __init__(self, subset: str, use_model_outputs: bool = False):
+        """Store the subset name (must be one of SUBSETS) and whether to attach baseline model outputs."""
+        super().__init__()
+        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+        self.subset = subset
+        self.use_model_outputs = use_model_outputs
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Load the pinned revision of the WildBench test split and turn every row into an instance.
+        Each instance carries the conversation as chat messages, has no references, and stores the
+        row's checklist in `extra_data`. When `use_model_outputs` is set, the first output of each
+        reference model for the same row index is stored under `extra_data["baseline_outputs"]`.
+        """
+        # Everything downloaded from HuggingFace is cached under <output_path>/data
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+        prompts = datasets.load_dataset(
+            "allenai/WildBench",
+            self.subset,
+            cache_dir=data_dir,
+            split="test",
+            revision="7c05c1b4550282b2ed6a2e6ac5db069f1e07df5c",
+        )
+        assert isinstance(prompts, datasets.Dataset)
+        if self.use_model_outputs:
+            # One dataset of pre-generated outputs per reference model, keyed by model name
+            baseline_outputs = {}
+            for reference_model in REFERENCE_MODELS:
+                baseline_outputs[reference_model] = datasets.load_dataset(
+                    "allenai/WildBench-V2-Model-Outputs",
+                    reference_model,
+                    cache_dir=data_dir,
+                    split="train",
+                    revision="d6755bc68220df853c0825a733430f73f5af2501",
+                )
+            assert all(isinstance(loaded, datasets.Dataset) for loaded in baseline_outputs.values())
+        # Convert every row into an instance
+        instances: List[Instance] = []
+        for row_index, row in enumerate(prompts):
+            messages = [{"role": turn["role"], "content": turn["content"]} for turn in row["conversation_input"]]
+            prompt = Input(messages=messages)
+            extra_data = {"checklist": row["checklist"]}
+            if self.use_model_outputs:
+                # Baseline rows are looked up by the same positional index as the prompt row
+                extra_data["baseline_outputs"] = {
+                    reference_model: baseline_outputs[reference_model][row_index]["output"][0]
+                    for reference_model in REFERENCE_MODELS
+                }
+            instances.append(Instance(input=prompt, references=[], split=TEST_SPLIT, extra_data=extra_data))
+        return instances

helm/benchmark/scenarios/winogrande_afr_scenario.py ADDED Viewed

@@ -0,0 +1,78 @@
+import csv
+import os
+from typing import Dict, List
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+class Winogrande_Afr_Scenario(Scenario):
+    """
+    Winogrande (S) translated into African low-resource languages.
+    https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages
+    """
+    name = "winogrande_afr"
+    description = "Winogrande (S) translated into 11 African low-resource languages"
+    tags = ["knowledge", "multiple_choice", "low_resource_languages"]
+    def __init__(self, lang: str = "af"):
+        """`lang` is the language code used in the CSV file name (`winogrande_<lang>.csv`)."""
+        super().__init__()
+        self.lang: str = lang
+    def download_winogrande_afr(self, path: str):
+        """Download the benchmark release archive and unzip it at `path`."""
+        ensure_file_downloaded(
+            source_url="https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip",  # noqa: E501
+            target_path=path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+    def process_csv(self, csv_path: str, split: str, pseudo_split: str) -> List[Instance]:
+        """
+        Read `csv_path` and return the instances whose last column equals `pseudo_split`.
+        Columns are addressed from the end of each row: question (-5), the two answer options
+        (-4 and -3), the correct choice "1" or "2" (-2) and the split label (-1).
+        `split` is the HELM split assigned to the returned instances.
+        """
+        # Match naming in Winogrande
+        if pseudo_split == "val":
+            pseudo_split = "train_s"
+        instances: List[Instance] = []
+        hlog(f"Reading {csv_path}")
+        # Explicit UTF-8 keeps the translated text independent of the platform's default encoding;
+        # newline="" is what the csv module requires for correct handling of quoted line breaks.
+        with open(csv_path, encoding="utf-8", newline="") as f:
+            reader = csv.reader(f, delimiter=",")
+            next(reader, None)  # skip the header
+            for row in reader:
+                # csv yields an empty list for blank lines; skip those and rows of other splits
+                if not row or row[-1] != pseudo_split:
+                    continue
+                question, answers, correct_choice = row[-5], row[-4:-2], row[-2]
+                correct_answer: str = dict(zip(["1", "2"], answers))[correct_choice]
+                references: List[Reference] = [
+                    Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+                    for answer in answers
+                ]
+                instances.append(Instance(input=Input(text=question), references=references, split=split))
+        return instances
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Download the data if needed and build the instances of all splits for the configured language.
+        Returns an empty list (after logging) when no CSV exists for the language.
+        """
+        # Download the raw data
+        desired_dir = "winogrande_s"
+        data_path: str = os.path.join(output_path, desired_dir)
+        self.download_winogrande_afr(data_path)
+        # All splits live in the same per-language file, so resolve and check it once
+        csv_path: str = os.path.join(data_path, desired_dir, f"winogrande_{self.lang}.csv")
+        if not os.path.exists(csv_path):
+            hlog(f"{csv_path} doesn't exist, skipping")
+            return []
+        # Split label used in the data -> HELM split
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "val": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        # Read all the instances
+        instances: List[Instance] = []
+        for pseudo_split, helm_split in splits.items():
+            instances.extend(self.process_csv(csv_path, helm_split, pseudo_split))
+        return instances

helm/benchmark/scenarios/wmt_14_scenario.py CHANGED Viewed

@@ -1,7 +1,17 @@
 from typing import List, Any
 from datasets import load_dataset
 from helm.common.hierarchical_logger import htrack_block
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 MAX_TRAIN_INSTANCES = 20_000  # This is arbitrary, but 20,000 training examples should be enough.
@@ -61,7 +71,9 @@ class WMT14Scenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
             subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
-            hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
+            hf_dataset: Any = load_dataset(
+                "wmt14", subset_name, trust_remote_code=True, revision="b199e406369ec1b7634206d3ded5ba45de2fe696"
+            )
             splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
         instances: List[Instance] = []

helm/benchmark/scenarios/xstest_scenario.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from typing import List
 from datasets import load_dataset
-from .scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
 class XSTestScenario(Scenario):

helm/benchmark/server.py CHANGED Viewed

@@ -25,11 +25,13 @@ def serve_config():
         return (
             f'window.BENCHMARK_OUTPUT_BASE_URL = "{app.config["helm.outputurl"]}";\n'
             f'window.RELEASE = "{app.config["helm.release"]}";\n'
+            f'window.PROJECT_ID = "{app.config["helm.project"]}";\n'
         )
     else:
         return (
             f'window.BENCHMARK_OUTPUT_BASE_URL = "{app.config["helm.outputurl"]}";\n'
             f'window.SUITE = "{app.config["helm.suite"]}";\n'
+            f'window.PROJECT_ID = "{app.config["helm.project"]}";\n'
         )
@@ -113,6 +115,13 @@ def main():
         default=None,
         help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
     )
+    parser.add_argument(
+        "--project",
+        type=str,
+        default=None,
+        help="Experimental: The name of the project to display on the landing page.",
+    )
     args = parser.parse_args()
     if args.suite and args.release:
@@ -143,6 +152,8 @@ def main():
     app.config["helm.suite"] = args.suite or "latest"
     app.config["helm.release"] = args.release
+    app.config["helm.release"] = args.release
+    app.config["helm.project"] = args.project or "lite"
     print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
     app.run(host="0.0.0.0", port=args.port)

helm/benchmark/slurm_runner.py CHANGED Viewed

@@ -32,7 +32,7 @@ from helm.benchmark.runner_config_registry import RUNNER_CONFIG
 _MAX_CONCURRENT_WORKER_SLURM_JOBS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKER_SLURM_JOBS"
 _SLURM_NODE_NAMES_ENV_NAME = "HELM_SLURM_NODE_NAMES"
-_DEFAULT_MAX_CONCURRENT_WORKER_SLURM = 8
+_DEFAULT_MAX_CONCURRENT_WORKER_SLURM = 4
 @dataclass

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl