crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
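Note: many modules move out of helm/proxy/ in this release (the helm/{proxy/clients → clients} and helm/{proxy/tokenizers → tokenizers} rename entries above). A minimal illustrative sketch of how downstream imports change under those moves; the class names are assumed from the module names and are not part of this diff:

# 0.4.0: clients and tokenizers lived under helm.proxy (hypothetical downstream imports)
# from helm.proxy.clients.auto_client import AutoClient
# from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# 0.5.0: the same modules are now top-level packages, per the renames listed above
from helm.clients.auto_client import AutoClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer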
@@ -0,0 +1,178 @@
+ import os
+ from typing import Dict, Any
+
+ import torch
+
+ from helm.benchmark.runner import get_cached_models_path
+ from helm.common.general import ensure_file_downloaded, hlog
+ from helm.common.images_utils import open_image
+ from helm.common.optional_dependencies import handle_module_not_found_error
+ from helm.common.gpu_utils import get_torch_device
+ from .base_detector import BaseDetector
+
+
+ MODEL_CONFIG_DOWNLOAD_URL: str = "https://drive.google.com/uc?id=1MLuwQ0ZN0gJQ42oVCc0aFz6Rneb1g3Rt"
+ MODEL_CHECKPOINT_DOWNLOAD_URL: str = (
+     "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/mask_rcnn_vitdet_b/f325346929/model_final_61ccd1.pkl"
+ )
+
+
+ class ViTDetDetector(BaseDetector):
+     def __init__(self):
+         try:
+             from detectron2.checkpoint import DetectionCheckpointer
+             from detectron2.config import LazyConfig
+             from detectron2.config import instantiate
+             from detectron2.data.catalog import MetadataCatalog
+         except ModuleNotFoundError as e:
+             handle_module_not_found_error(e, ["heim"])
+
+         super().__init__()
+
+         cache_path: str = get_cached_models_path()
+         cfg_path: str = os.path.join(cache_path, "vitdet_model.yaml")
+         ensure_file_downloaded(source_url=MODEL_CONFIG_DOWNLOAD_URL, target_path=cfg_path)
+         cfg = LazyConfig.load(cfg_path)
+
+         model_path: str = os.path.join(cache_path, "vitdet_model.pkl")
+         ensure_file_downloaded(source_url=MODEL_CHECKPOINT_DOWNLOAD_URL, target_path=model_path)
+         cfg.train.init_checkpoint = model_path
+
+         model = instantiate(cfg.model).cuda()
+         model = model.eval()
+         for p in model.parameters():
+             p.requires_grad = False
+         DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
+
+         self._cfg = cfg
+         self._model = model
+         self._device: torch.device = get_torch_device()
+         hlog("Initialized the ViTDet model.")
+
+         # COCO classes
+         self._coco_classes = MetadataCatalog.get("coco_2017_val").thing_classes
+
+     def forward_model(self, image_location: str) -> float:
+         try:
+             from detectron2.data.common import DatasetFromList, MapDataset
+             from detectron2.config import instantiate
+         except ModuleNotFoundError as e:
+             handle_module_not_found_error(e, ["heim"])
+
+         image = open_image(image_location)
+         dataset_dicts = [
+             {
+                 "file_name": image_location,
+                 "width": image.width,
+                 "height": image.height,
+             }
+         ]
+         dataset = DatasetFromList(dataset_dicts, copy=False)
+         mapper = instantiate(self._cfg.dataloader.test.mapper)
+         dataset = MapDataset(dataset, mapper)
+         inputs = [dataset[0]]
+         outputs = self._model(inputs)
+         return outputs[0]["instances"]
+
+     def compute_score(self, caption: str, image_location: str, references: Dict[str, Any]) -> float:
+         # hlog(f'compute score for prompt: {caption}, file: {image_location}, skill: {references["skill"]}')
+         instances = self.forward_model(image_location)
+         if references["skill"] == "object":
+             return self.compute_score_object(instances, references)
+         if references["skill"] == "count":
+             return self.compute_score_count(instances, references)
+         if references["skill"] == "spatial":
+             return self.compute_score_spatial(instances, references)
+         raise NotImplementedError(references["skill"])
+
+     def compute_score_object(self, instances, references):
+         gt_class_name = references["object"]
+         gt_class = self._coco_classes.index(gt_class_name)
+         if len(instances.scores) == 0:
+             pred_id = None
+             pred_score = torch.zeros(())
+             pred_class = None
+             pred_class_name = None
+             correct = 0.0
+         else:
+             pred_id = instances.scores.max(-1).indices
+             pred_score = instances.scores[pred_id]  # (num_instances,) -> ()  # noqa
+             pred_class = instances.pred_classes[pred_id]  # (num_instances,) -> ()
+             pred_class_name = self._coco_classes[pred_class.item()]  # noqa
+
+             correct = float(pred_class == gt_class)
+
+         # hlog(f"pred_class: {pred_class_name}, gt_class: {gt_class_name}, correct: {correct}")
+         return correct
+
+     def compute_score_count(self, instances, references):
+         # assume that there is only one type of object
+         gt_class_name = references["object"]
+         gt_class_idx = self._coco_classes.index(gt_class_name)
+         gt_count = references["count"]
+         if len(instances.scores) == 0:
+             pred_count = 0
+             correct = 0.0
+         else:
+             pred_count = (instances.pred_classes == gt_class_idx).sum().item()
+             correct = float(pred_count == gt_count)
+         return correct
+
+     def compute_score_spatial(self, instances, references):
+         gt_class_name_1, gt_class_name_2 = references["objects"]
+         gt_class_idx_1 = self._coco_classes.index(gt_class_name_1)
+         gt_class_idx_2 = self._coco_classes.index(gt_class_name_2)
+         relation = references["relation"].split("_")[0]
+
+         if len(instances.scores) == 0:
+             correct = 0
+             pred_rel = "no_pred"
+         else:
+             pred_count_1 = (instances.pred_classes == gt_class_idx_1).sum().item()
+             pred_count_2 = (instances.pred_classes == gt_class_idx_2).sum().item()
+             if pred_count_1 != 1 or pred_count_2 != 1:
+                 correct = 0
+                 pred_rel = "obj_count_mismatch"
+             else:
+                 x11, y11 = instances.pred_boxes[instances.pred_classes == gt_class_idx_1].tensor[0, :2]
+                 x21, y21 = instances.pred_boxes[instances.pred_classes == gt_class_idx_2].tensor[0, :2]
+
+                 x_diff = x11 - x21
+                 y_diff = y11 - y21
+
+                 # FIXME: The code below mimics dall-eval logic. I don't think
+                 # we need to follow it. Does the case of two objects of same
+                 # category make sense? Also, I don't know why we need to
+                 # to ensure something is more "right" than it is "above".
+                 if gt_class_name_1 == gt_class_name_2:
+                     if abs(x_diff) > abs(y_diff):
+                         if relation in ["left", "right"]:
+                             correct = 1
+                             pred_rel = "relation_correct"
+                         else:
+                             pred_rel = "relation_incorrect"
+                             correct = 0
+                     else:
+                         if relation in ["above", "below"]:
+                             pred_rel = "relation_correct"
+                             correct = 1
+                         else:
+                             pred_rel = "relation_incorrect"
+                             correct = 0
+                 else:
+                     if abs(x_diff) > abs(y_diff):
+                         if x11 < x21:
+                             pred_rel = "right"
+                         else:
+                             pred_rel = "left"
+                     else:
+                         if y11 > y21:
+                             pred_rel = "above"
+                         else:
+                             pred_rel = "below"
+
+                     if relation == pred_rel:
+                         correct = 1
+                     else:
+                         correct = 0
+         return correct
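The reference payloads that ViTDetDetector.compute_score dispatches on are not documented in the file itself; a hypothetical sketch of their shape, with key names taken from the branches above and values invented purely for illustration:

# Hypothetical reference dicts for each skill handled by compute_score above;
# the key names ("skill", "object", "count", "objects", "relation") come from the code,
# the values are made-up examples.
object_refs = {"skill": "object", "object": "dog"}
count_refs = {"skill": "count", "object": "dog", "count": 2}
spatial_refs = {"skill": "spatial", "objects": ["dog", "cat"], "relation": "left_of"}  # split("_")[0] -> "left"

# detector = ViTDetDetector()  # requires detectron2, a CUDA device, and the downloaded checkpoint
# score = detector.compute_score("a dog to the left of a cat", "image.png", spatial_refs)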
@@ -0,0 +1,41 @@
+ from typing import List
+
+ from helm.common.request import RequestResult
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.common.multimodal_request_utils import gather_generated_image_locations
+
+
+ class EfficiencyMetric(Metric):
+     """
+     Defines the efficiency metrics for text-to-image models.
+     """
+
+     def __repr__(self):
+         return "EfficiencyMetric()"
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         prompt: str = request_state.request.prompt
+
+         assert request_state.result is not None
+         request_result: RequestResult = request_state.result
+         image_locations: List[str] = gather_generated_image_locations(request_result)
+         if len(image_locations) == 0:
+             return []
+
+         # inference_runtime is computed in BasicMetric
+         stats: List[Stat] = [
+             Stat(MetricName("prompt_length")).add(len(prompt)),
+             Stat(MetricName("num_generated_images")).add(len(request_result.completions)),
+         ]
+         return stats
@@ -0,0 +1,168 @@
+ from tqdm import tqdm
+ from typing import Dict, List, Set, Optional
+ import math
+ import os
+ import shutil
+
+ from helm.common.general import ensure_directory_exists, generate_unique_id, get_file_name, hlog
+ from helm.common.gpu_utils import is_cuda_available, get_torch_device
+ from helm.common.request import RequestResult
+ from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
+ from helm.benchmark.scenarios.scenario import Instance
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.metric import MetricInterface, MetricResult
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.common.images_utils import is_blacked_out_image, copy_image
+ from helm.common.optional_dependencies import handle_module_not_found_error
+
+
+ class FidelityMetric(MetricInterface):
+     """
+     Frechet Inception Distance (FID) is a measure of similarity between two sets of images.
+     Inception Score (IS) measures quality and diversity of images.
+     Both metrics require a large number of samples to compute.
+
+     @misc{Seitzer2020FID,
+       author={Maximilian Seitzer},
+       title={{pytorch-fid: FID Score for PyTorch}},
+       month={August},
+       year={2020},
+       note={Version 0.3.0},
+       howpublished={https://github.com/mseitzer/pytorch-fid},
+     }
+
+     @misc{obukhov2020torchfidelity,
+       author={Anton Obukhov and Maximilian Seitzer and Po-Wei Wu and Semen Zhydenko and Jonathan Kyl
+       and Elvis Yu-Jing Lin},
+       year=2020,
+       title={High-fidelity performance metrics for generative models in PyTorch},
+       url={https://github.com/toshas/torch-fidelity},
+       publisher={Zenodo},
+       version={v0.3.0},
+       doi={10.5281/zenodo.4957738},
+       note={Version: 0.3.0, DOI: 10.5281/zenodo.4957738}
+     }
+     """
+
+     IMAGE_WIDTH: int = 512
+     IMAGE_HEIGHT: int = 512
+
+     def __repr__(self):
+         return "FidelityMetric()"
+
+     def evaluate(
+         self,
+         scenario_state: ScenarioState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+         parallelism: int,
+     ) -> MetricResult:
+         try:
+             import torch_fidelity
+             from pytorch_fid.fid_score import calculate_fid_given_paths
+         except ModuleNotFoundError as e:
+             handle_module_not_found_error(e, ["heim"])
+
+         dest_path: str
+         unique_perturbations: Set[Optional[PerturbationDescription]] = set()
+
+         gold_images_path: str = os.path.join(eval_cache_path, generate_unique_id())
+         ensure_directory_exists(gold_images_path)
+
+         # The library requires the gold and generated images to be in two separate directories.
+         # Gather the gold images and the unique perturbations
+         num_gold_images: int = 0
+         for request_state in tqdm(scenario_state.request_states):
+             instance: Instance = request_state.instance
+             unique_perturbations.add(instance.perturbation)
+
+             for reference in instance.references:
+                 if not reference.is_correct:
+                     continue
+
+                 assert (
+                     reference.output.multimedia_content is not None
+                     and reference.output.multimedia_content.media_objects[0].location is not None
+                 )
+                 file_path: str = reference.output.multimedia_content.media_objects[0].location
+                 dest_path = os.path.join(gold_images_path, get_file_name(file_path))
+                 copy_image(file_path, dest_path, width=self.IMAGE_WIDTH, height=self.IMAGE_HEIGHT)
+                 num_gold_images += 1
+         hlog(f"Resized {num_gold_images} gold images to {self.IMAGE_WIDTH}x{self.IMAGE_HEIGHT}.")
+
+         # Compute the FID for each perturbation group
+         stats: List[Stat] = []
+         for perturbation in unique_perturbations:
+             perturbation_name: str = "" if perturbation is None else str(perturbation)
+             generated_images_path: str = os.path.join(eval_cache_path, generate_unique_id())
+             ensure_directory_exists(generated_images_path)
+
+             num_generated_images: int = 0
+             for request_state in tqdm(scenario_state.request_states):
+                 if request_state.instance.perturbation != perturbation:
+                     continue
+
+                 assert request_state.result is not None
+                 request_result: RequestResult = request_state.result
+
+                 # Gather the model-generated images
+                 for image in request_result.completions:
+                     assert image.multimodal_content is not None
+                     location = image.multimodal_content.media_objects[0].location
+                     if location is not None and not is_blacked_out_image(location):
+                         dest_path = os.path.join(generated_images_path, get_file_name(location))
+                         copy_image(location, dest_path, width=self.IMAGE_WIDTH, height=self.IMAGE_HEIGHT)
+                         num_generated_images += 1
+
+             compute_kid: bool = num_generated_images >= 1000
+             hlog(f"Resized {num_generated_images} images to {self.IMAGE_WIDTH}x{self.IMAGE_HEIGHT}.")
+
+             try:
+                 hlog(f"Computing FID between {generated_images_path} and {gold_images_path}...")
+                 fid: float = calculate_fid_given_paths(
+                     paths=[generated_images_path, gold_images_path],
+                     device=get_torch_device(),
+                     # Following defaults set in
+                     # https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py#L54
+                     batch_size=50,
+                     dims=2048,
+                     num_workers=8,
+                 )
+                 hlog(f"Done. FID score: {fid}")
+
+                 # The torch_fidelity library fails when there are too few images (i.e., `max_eval_instances` is small).
+                 hlog("Computing the other fidelity metrics...")
+                 metrics_dict: Dict[str, float] = torch_fidelity.calculate_metrics(
+                     input1=generated_images_path,
+                     input2=gold_images_path,
+                     isc=True,
+                     fid=False,
+                     kid=compute_kid,
+                     ppl=False,  # Requires `GenerativeModel`
+                     cuda=is_cuda_available(),
+                     save_cpu_ram=not is_cuda_available(),
+                 )
+                 inception_score: float = metrics_dict["inception_score_mean"]
+                 if math.isnan(inception_score):
+                     inception_score = 0
+
+                 stats.extend(
+                     [
+                         Stat(MetricName("fid", perturbation=perturbation)).add(fid),
+                         Stat(MetricName("inception_score", perturbation=perturbation)).add(inception_score),
+                     ]
+                 )
+                 if compute_kid:
+                     kid: float = metrics_dict["kernel_inception_distance_mean"]
+                     stats.append(Stat(MetricName("kernel_inception_distance", perturbation=perturbation)).add(kid))
+             except AssertionError as e:
+                 hlog(f"Error occurred when computing fidelity metrics for perturbation: {perturbation_name} Error: {e}")
+
+             shutil.rmtree(generated_images_path)
+
+         # Delete the gold images directory
+         shutil.rmtree(gold_images_path)
+
+         return MetricResult(aggregated_stats=stats, per_instance_stats=[])
@@ -0,0 +1,63 @@
+ import numpy as np
+
+ from helm.common.optional_dependencies import handle_module_not_found_error
+
+
+ def compute_fractal_dimension(image_path: str) -> float:
+     """
+     Compute the fractal coefficient of an image.
+     From https://en.wikipedia.org/wiki/Minkowski–Bouligand_dimension, in fractal
+     geometry, the Minkowski–Bouligand dimension, also known as Minkowski dimension
+     or box-counting dimension, is a way of determining the fractal dimension of a
+     set S in a Euclidean space Rn, or more generally in a metric space (X, d).
+
+     Adapted from https://gist.github.com/viveksck/1110dfca01e4ec2c608515f0d5a5b1d1.
+
+     :param image_path: Path to the image.
+     """
+
+     def fractal_dimension(Z, threshold=0.2):
+         # Only for 2d image
+         assert len(Z.shape) == 2
+
+         # From https://github.com/rougier/numpy-100 (#87)
+         def boxcount(Z, k):
+             S = np.add.reduceat(
+                 np.add.reduceat(Z, np.arange(0, Z.shape[0], k), axis=0), np.arange(0, Z.shape[1], k), axis=1
+             )
+
+             # We count non-empty (0) and non-full boxes (k*k)
+             return len(np.where((S > 0) & (S < k * k))[0])
+
+         # Transform Z into a binary array
+         Z = Z < threshold
+
+         # Minimal dimension of image
+         p = min(Z.shape)
+
+         # Greatest power of 2 less than or equal to p
+         n = 2 ** np.floor(np.log(p) / np.log(2))
+
+         # Extract the exponent
+         n = int(np.log(n) / np.log(2))
+
+         # Build successive box sizes (from 2**n down to 2**1)
+         sizes = 2 ** np.arange(n, 1, -1)
+
+         # Actual box counting with decreasing size
+         counts = []
+         for size in sizes:
+             counts.append(boxcount(Z, size))
+
+         # Fit the successive log(sizes) with log (counts)
+         coeffs = np.polyfit(np.log(sizes), np.log(counts), 1)
+         return -coeffs[0]
+
+     try:
+         import cv2
+     except ModuleNotFoundError as e:
+         handle_module_not_found_error(e, ["heim"])
+
+     image = cv2.imread(image_path, 0) / 255.0  # type: ignore
+     assert image.min() >= 0 and image.max() <= 1
+     return fractal_dimension(image)
@@ -0,0 +1,33 @@
+ import os
+
+ from .fractal_dimension_util import compute_fractal_dimension
+
+
+ def fractal_dimension_test(image_filename: str, expected_fractal_dimension: float):
+     image_path: str = os.path.join(os.path.dirname(__file__), "test_images", image_filename)
+     dim: float = compute_fractal_dimension(image_path)
+     assert round(dim, 2) == expected_fractal_dimension
+
+
+ # Test case are inspired by https://www.sciencedirect.com/science/article/pii/S0097849303001547
+ def test_compute_fractal_dimension_cloud():
+     # Clouds have a fractal dimension (D) of 1.30-1.33.
+     fractal_dimension_test("cloud.png", 1.34)
+
+
+ def test_compute_fractal_dimension_sea_anemone():
+     # Sea anemones have a D of 1.6.
+     fractal_dimension_test("sea_anemone.png", 1.54)
+
+
+ def test_compute_fractal_dimension_snowflake():
+     # Snowflakes have a D of 1.7.
+     fractal_dimension_test("snowflakes.png", 1.69)
+
+
+ def test_compute_fractal_dimension_convergence():
+     # "Pollock continued to drip paint for a period lasting up to six months, depositing layer upon layer,
+     # and gradually creating a highly dense fractal pattern. As a result, the D value of his paintings rose
+     # gradually as they neared completion, starting in the range of 1.3–1.5 for the initial springboard layer
+     # and reaching a final value as high as 1.9". Convergence was produced in 1952 by Jackson Pollock.
+     fractal_dimension_test("convergence.png", 1.83)
@@ -0,0 +1,50 @@
+ import math
+ from statistics import mean
+ from typing import List
+
+ from helm.common.request import RequestResult
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.common.multimodal_request_utils import gather_generated_image_locations
+ from .fractal_dimension.fractal_dimension_util import compute_fractal_dimension
+
+
+ class FractalDimensionMetric(Metric):
+
+     # From https://www.nature.com/articles/35065154, "participants in the perception study consistently
+     # preferred fractals with D values in the range of 1.3 to 1.5, irrespective of the pattern's origin.
+     # Significantly, many of the fractal patterns surrounding us in nature have D values in this range.
+     # Clouds have a value of 1.3."
+     IDEAL_FRACTAL_DIMENSION: float = 1.4
+
+     def __repr__(self):
+         return "FractalDimensionMetric()"
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.result is not None
+         request_result: RequestResult = request_state.result
+         image_locations: List[str] = gather_generated_image_locations(request_result)
+         if len(image_locations) == 0:
+             return []
+
+         fractal_dimensions: List[float] = [
+             compute_fractal_dimension(image_location) for image_location in image_locations
+         ]
+         fractal_dimension_losses: List[float] = [
+             abs(dim - self.IDEAL_FRACTAL_DIMENSION) for dim in fractal_dimensions if not math.isnan(dim)
+         ]
+
+         stats: List[Stat] = []
+         if len(fractal_dimension_losses) > 0:
+             stats.append(Stat(MetricName("fractal_dimension_loss")).add(mean(fractal_dimension_losses)))
+         return stats
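A toy, self-contained sketch (not part of the diff) of the fractal_dimension_loss statistic reported above: the mean absolute deviation of each image's estimated dimension from IDEAL_FRACTAL_DIMENSION, skipping NaN estimates.

import math
from statistics import mean

IDEAL_FRACTAL_DIMENSION = 1.4
fractal_dimensions = [1.3, 1.55, float("nan")]  # hypothetical per-image estimates
losses = [abs(d - IDEAL_FRACTAL_DIMENSION) for d in fractal_dimensions if not math.isnan(d)]
print(mean(losses) if losses else None)  # (0.1 + 0.15) / 2 ≈ 0.125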
@@ -0,0 +1,58 @@
+ from statistics import mean
+ from typing import List
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.common.request import RequestResult
+ from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
+ from helm.common.images_utils import is_blacked_out_image
+ from helm.common.multimodal_request_utils import gather_generated_image_locations
+
+
+ class GenderMetric(Metric):
+     """
+     Following https://arxiv.org/abs/2202.04053, use the CLIP score to determine if the photo
+     is of a female or male.
+     """
+
+     MALE_PROMPT: str = "a photo of a male"
+     FEMALE_PROMPT: str = "a photo of a female"
+
+     IDEAL_FRAC: float = 0.5
+
+     def __repr__(self):
+         return "GenderMetric()"
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.result is not None
+         request_result: RequestResult = request_state.result
+
+         image_locations: List[str] = gather_generated_image_locations(request_result)
+         is_female_results: List[bool] = [
+             self._is_photo_of_female(metric_service, location)
+             for location in image_locations
+             if location is not None and not is_blacked_out_image(location)
+         ]
+
+         female_frac: float = mean(is_female_results) if len(is_female_results) > 0 else 0
+         stats: List[Stat] = [Stat(MetricName("gender_imbalance")).add(abs(female_frac - self.IDEAL_FRAC))]
+         return stats
+
+     def _is_photo_of_female(self, metric_service: MetricService, image_location: str) -> bool:
+         def make_clip_score_request(prompt: str) -> float:
+             result: CLIPScoreResult = metric_service.compute_clip_score(CLIPScoreRequest(prompt, image_location))
+             return result.score
+
+         female_clip_score: float = make_clip_score_request(self.FEMALE_PROMPT)
+         male_clip_score: float = make_clip_score_request(self.MALE_PROMPT)
+         return female_clip_score > male_clip_score
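Similarly, a toy sketch (not part of the diff) of the gender_imbalance statistic computed by GenderMetric above: the absolute deviation of the female fraction from IDEAL_FRAC = 0.5, where each entry stands for one per-image CLIP comparison.

from statistics import mean

is_female_results = [True, True, True, False]  # hypothetical per-image CLIP decisions
female_frac = mean(is_female_results) if len(is_female_results) > 0 else 0
print(abs(female_frac - 0.5))  # 0.25; perfectly balanced generations would give 0.0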