PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482) hide show

helm/benchmark/metrics/image_generation/watermark/__init__.py ADDED Viewed

File without changes

helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py ADDED Viewed

@@ -0,0 +1,16 @@
+from typing import List
+import os
+from .watermark_detector import WatermarkDetector
+def test_compute_watermark_probability():
+    watermark_detector = WatermarkDetector()
+    # These test images are from https://github.com/LAION-AI/LAION-5B-WatermarkDetection
+    base_path: str = os.path.join(os.path.dirname(__file__), "test_images")
+    clear_image_path: str = os.path.join(base_path, "clear_example.png")
+    watermark_image_path: str = os.path.join(base_path, "watermark_example.png")
+    has_watermarks: List[bool] = watermark_detector.has_watermark([clear_image_path, watermark_image_path])[0]
+    assert has_watermarks == [False, True]

helm/benchmark/metrics/image_generation/watermark/watermark_detector.py ADDED Viewed

@@ -0,0 +1,87 @@
+import os
+from typing import List, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.data
+from torchvision import transforms as T
+from helm.benchmark.runner import get_cached_models_path
+from helm.common.general import ensure_file_downloaded, hlog
+from helm.common.gpu_utils import get_torch_device
+from helm.common.images_utils import open_image
+from helm.common.optional_dependencies import handle_module_not_found_error
+class WatermarkDetector:
+    """
+    We use LAION's watermark detector (https://github.com/LAION-AI/LAION-5B-WatermarkDetection).
+    Adapted from https://github.com/LAION-AI/LAION-5B-WatermarkDetection/blob/main/example_use.py
+    """
+    MODEL_URL: str = "https://github.com/LAION-AI/LAION-5B-WatermarkDetection/raw/main/models/watermark_model_v1.pt"
+    # The example code from LAION used 0.5, but we observed that the watermark detector model could
+    # confuse text in an image as a watermark, so we set the threshold to a higher value of 0.9.
+    # The detector believes that the test example has a watermark with a 93.563% probability.
+    WATERMARK_THRESHOLD: float = 0.9
+    @staticmethod
+    def load_model():
+        """
+        Load the watermark detector model.
+        """
+        try:
+            import timm
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+        model = timm.create_model("efficientnet_b3a", pretrained=True, num_classes=2)
+        model.classifier = nn.Sequential(
+            # 1536 is the original in_features
+            nn.Linear(in_features=1536, out_features=625),
+            nn.ReLU(),  # ReLu to be the activation function
+            nn.Dropout(p=0.3),
+            nn.Linear(in_features=625, out_features=256),
+            nn.ReLU(),
+            nn.Linear(in_features=256, out_features=2),
+        )
+        watermark_model_path: str = os.path.join(get_cached_models_path(), "watermark_model_v1.pt")
+        ensure_file_downloaded(WatermarkDetector.MODEL_URL, watermark_model_path)
+        state_dict = torch.load(watermark_model_path)
+        model.load_state_dict(state_dict)
+        model.eval()  # Evaluate the model
+        return model.to(get_torch_device())
+    def __init__(self):
+        self._model = self.load_model()
+    def has_watermark(self, image_locations: List[str]) -> Tuple[List[bool], List[float]]:
+        """
+        Returns a list of booleans indicating whether each image (given by `image_locations`)
+        contains a watermark or not.
+        """
+        # Preprocess images (resize and normalize)
+        images: List[torch.Tensor] = []
+        preprocessing = T.Compose(
+            [T.Resize((256, 256)), T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
+        )
+        for location in image_locations:
+            # Location can be a file path or a URL
+            image = preprocessing(open_image(location).convert("RGB"))
+            images.append(image)
+        result: List[bool] = []
+        probs: List[float] = []
+        with torch.no_grad():
+            pred = self._model(torch.stack(images).to(get_torch_device()))
+            syms = F.softmax(pred, dim=1).detach().cpu().numpy().tolist()
+            for i, sym in enumerate(syms):
+                watermark_prob, clear_prob = sym
+                if watermark_prob > self.WATERMARK_THRESHOLD:
+                    hlog(f"Image at {image_locations[i]} has a watermark with {watermark_prob} probability.")
+                result.append(watermark_prob >= self.WATERMARK_THRESHOLD)
+                probs.append(watermark_prob)
+        return result, probs

helm/benchmark/metrics/image_generation/watermark_metrics.py ADDED Viewed

@@ -0,0 +1,48 @@
+from statistics import mean
+from typing import List
+from helm.common.request import RequestResult
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.common.multimodal_request_utils import gather_generated_image_locations
+from .watermark.watermark_detector import WatermarkDetector
+class WatermarkMetric(Metric):
+    """
+    Defines metrics for detecting watermarks in images using the
+    LAION's watermark detector (https://github.com/LAION-AI/LAION-5B-WatermarkDetection).
+    """
+    def __init__(self):
+        self._watermark_detector = WatermarkDetector()
+    def __repr__(self):
+        return "WatermarkMetric()"
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        image_locations: List[str] = gather_generated_image_locations(request_result)
+        if len(image_locations) == 0:
+            return []
+        # Batch process the images and detect if they have watermarks
+        has_watermarks, watermark_probs = self._watermark_detector.has_watermark(image_locations)
+        stats: List[Stat] = [
+            Stat(MetricName("watermark_frac")).add(mean(has_watermarks) if len(has_watermarks) > 0 else 0),
+            Stat(MetricName("expected_max_watermark_prob")).add(
+                max(watermark_probs) if len(watermark_probs) > 0 else 0
+            ),
+        ]
+        return stats

helm/benchmark/metrics/instruction_following_critique_metrics.py CHANGED Viewed

@@ -73,7 +73,9 @@ class InstructionFollowingCritiqueMetric(Metric):
     }
     KEYWORD_NAME: str = "Keyword Feedback"
-    KEYWORD_PROMPT: str = "Provide a comma-separated list of keywords that capture what's wrong with the response (e.g., typos, swear words, too long)"  # noqa: E501
+    KEYWORD_PROMPT: str = (
+        "Provide a comma-separated list of keywords that capture what's wrong with the response (e.g., typos, swear words, too long)"  # noqa: E501
+    )
     def __init__(self, num_respondents: int) -> None:
         self._template = CritiqueTaskTemplate(

helm/benchmark/metrics/language_modeling_metrics.py ADDED Viewed

@@ -0,0 +1,99 @@
+from collections import defaultdict
+from typing import List, Dict, Set
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.basic_metrics import (
+    compute_language_modeling_metrics,
+    compute_perplexity_metrics,
+    compute_request_state_metrics,
+)
+from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from .metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from .metric_name import MetricContext, MetricName
+from .metric_service import MetricService
+from .statistic import Stat, merge_stat
+class LanguageModelingMetric(MetricInterface):
+    """
+    Defines the basic metrics available when using the ADAPT_LANGUAGE_MODELING adapter.
+    This is parallel to BasicMetric and produces many of the same Stats.
+    """
+    def __init__(self, names: List[str]):
+        # `names` is stored here but not read by any method of this class as shown.
+        self.names: List[str] = names
+        self.efficiency_metric = EfficiencyMetric()
+    def __repr__(self):
+        return "LanguageModelingMetric"
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        """
+        Evaluate every request state of `scenario_state` and aggregate the resulting stats.
+        Per-request stats are tagged with the instance's context, merged into a single trial,
+        extended with derived stats and a "num_instances" stat per context, and finally reduced
+        with `take_mean()`. Returns a MetricResult holding the aggregated stats and the
+        per-instance stats. `parallelism` is accepted but not used in this method.
+        """
+        global_stats: Dict[MetricName, Stat] = {}
+        # The first and only trial
+        trial_stats: Dict[MetricName, Stat] = {}
+        # Per-instance stats
+        all_per_instance_stats: List[PerInstanceStats] = []
+        # Ids of the instances seen for each context; used for the "num_instances" stat below
+        instance_ids_per_context: Dict[MetricContext, Set[str]] = defaultdict(set)
+        for request_state in scenario_state.request_states:
+            # Evaluate request_state
+            request_stats = self.evaluate_generation(
+                scenario_state.adapter_spec, request_state, metric_service, eval_cache_path
+            )
+            # Add instance-related context (e.g., split, perturbation) to the metrics
+            # NOTE: the instance id is recorded inside this loop, so a request state that yields
+            # no stats does not contribute to instance_ids_per_context.
+            for i, stat in enumerate(request_stats):
+                context = MetricContext.from_instance(request_state.instance)
+                request_stats[i] = add_context(stat, context)
+                assert request_state.instance.id is not None
+                instance_ids_per_context[context].add(request_state.instance.id)
+            # Use trial index of 0 here since we run only one trial for LM
+            assert request_state.instance.id is not None
+            all_per_instance_stats.append(
+                PerInstanceStats(request_state.instance.id, request_state.instance.perturbation, 0, request_stats)
+            )
+            for stat in request_stats:
+                merge_stat(trial_stats, stat)
+        # group stats according to the context (e.g., split, perturbation) and call derive_stats on each grouping
+        grouped_trial_stats: Dict[MetricContext, Dict[MetricName, Stat]] = defaultdict(dict)
+        for metric_name, stat in trial_stats.items():
+            grouped_trial_stats[MetricContext.from_metric_name(metric_name)][metric_name] = stat  # group by context
+        # The loop below iterates the grouped copy while merging new stats into trial_stats
+        for context, stats_dict in grouped_trial_stats.items():
+            for stat in self.derive_stats(stats_dict):
+                merge_stat(trial_stats, add_context(stat, context))
+            # keep track of how many instances are in each subset
+            num_instances_stat = Stat(MetricName("num_instances")).add(len(instance_ids_per_context[context]))
+            merge_stat(trial_stats, add_context(num_instances_stat, context))
+        # Reduce each stat of the single trial with take_mean() before reporting it
+        for stat in trial_stats.values():
+            merge_stat(global_stats, stat.take_mean())
+        return MetricResult(list(global_stats.values()), all_per_instance_stats)
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Compute all metrics for a single request state.
+        Concatenates the results of `compute_request_state_metrics` and
+        `compute_language_modeling_metrics`. `eval_cache_path` is not used here.
+        """
+        stats: List[Stat] = []
+        stats.extend(compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service))
+        stats.extend(compute_language_modeling_metrics(adapter_spec, request_state, metric_service))
+        return stats
+    def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]:
+        """Derive perplexity metrics if applicable. We don't worry about splits and perturbations here."""
+        derived_stats: List[Stat] = []
+        derived_stats.extend(compute_perplexity_metrics(stats_dict))
+        return derived_stats

helm/benchmark/metrics/machine_translation_metrics.py ADDED Viewed

@@ -0,0 +1,89 @@
+from typing import List
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.common.optional_dependencies import handle_module_not_found_error
+from .metric_name import MetricName
+from .statistic import Stat
+try:
+    from sacrebleu.metrics import BLEU
+    from langdetect import detect
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e)
+class MachineTranslationMetric(EvaluateInstancesMetric):
+    """
+    Compute the BLEU score for Machine Translation scenarios. The implementation is based on sacrebleu.
+    """
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        """
+        Compute the corpus-level metric based on all reqeust_states.
+        """
+        bleu = BLEU()
+        refs: List[List[str]] = [[]]
+        sys: List = []
+        for request_state in request_states:
+            # Assume there is one referece per instance. TODO: Support multiple references after adding more scenarios.
+            num_references: int = len(request_state.instance.references)
+            if num_references != 1:
+                raise ValueError(f"This instance has {num_references} references, but we currently only support one.")
+            # Usually there is only one completion for each instance.
+            assert request_state.result is not None
+            if len(request_state.result.completions) != 1:
+                raise ValueError("Each request result should have only exactly one completion.")
+            sys.append(request_state.result.completions[0].text)
+            refs[0].append(request_state.instance.references[0].output.text)
+        bleu_score = bleu.corpus_score(sys, refs).score
+        return [Stat(MetricName("bleu")).add(bleu_score)]
+class CLEVAMachineTranslationMetric(EvaluateInstancesMetric):
+    """
+    Compute the BLEU score for Machine Translation scenarios of CLEVA benchmark.
+    Based on sacrebleu, this implementation distinguishes target language and allows variable number of references.
+    If there are more than one hypothesis, only the first one is adopted in the calculation.
+    """
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
+        """
+        Compute the corpus-level metric based on all reqeust_states.
+        """
+        def detect_language(request_states: List[RequestState]) -> str:
+            """
+            Determine the target language by detecting the language of references.
+            Currently, it only distinguishes if the target language is Chinese.
+            """
+            corpus: str = "".join(
+                [request_state.instance.references[0].output.text for request_state in request_states[:10]]
+            )
+            if detect(corpus) in ["zh-cn", "zh-tw"]:
+                return "zh"
+            else:
+                return "13a"  # Default tokenizer for sacrebleu.BLEU
+        bleu = BLEU(tokenize=detect_language(request_states))
+        max_num_references: int = max([len(request_state.instance.references) for request_state in request_states])
+        refs: List[List[str]] = [
+            [
+                request_state.instance.references[i].output.text if i < len(request_state.instance.references) else ""
+                for request_state in request_states
+            ]
+            for i in range(max_num_references)
+        ]
+        sys: List = []
+        for request_state in request_states:
+            assert request_state.result is not None
+            sys.append(request_state.result.completions[0].text)
+        bleu_score = bleu.corpus_score(sys, refs).score
+        return [Stat(MetricName("cleva_machine_translation_bleu")).add(bleu_score)]

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl