crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (209)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/metrics/annotation_metrics.py
@@ -0,0 +1,108 @@
+ from typing import List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class AnnotationLabelMetric(Metric):
+     """Binary metric for labels produced by annotators.
+
+     Expects the annotation with the given annotator name and key to be a string label.
+
+     For each possible label in the list of possible labels, produces a
+     corresponding stat with a value of 1 or 0 indicating if the actual label
+     in the annoation."""
+
+     def __init__(self, annotator_name: str, key: str, labels: List[str]):
+         super().__init__()
+         self.annotator_name = annotator_name
+         self.key = key
+         self.labels = labels
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.annotations
+         annotation_label = request_state.annotations[self.annotator_name][self.key]
+         if annotation_label not in self.labels:
+             raise ValueError(
+                 f"Unrecognized annotation label '{annotation_label}' "
+                 f"(known labels: {self.labels}) "
+                 f"in annotation {request_state.annotations[self.annotator_name]} "
+                 f"for instance id {request_state.instance.id}"
+             )
+         stats: List[Stat] = []
+         for label in self.labels:
+             stats.append(
+                 Stat(MetricName(f"annotation_{self.annotator_name}_{self.key}_{label}")).add(
+                     1 if label == annotation_label else 0
+                 )
+             )
+         return stats
+
+
+ class AnnotationNumericMetric(Metric):
+     """Numeric metric for numbers produced by annotators.
+
+     Expects the annotation with the given annotator name and key to be a number."""
+
+     def __init__(self, annotator_name: str, key: str):
+         super().__init__()
+         self.annotator_name = annotator_name
+         self.key = key
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.annotations
+         score = request_state.annotations[self.annotator_name][self.key]
+         return [Stat(MetricName(f"annotation_{self.annotator_name}_{self.key}")).add(score)]
+
+
+ class AnnotationLikertScaleMetric(Metric):
+     """Numeric metric for labels produced by annotators.
+
+     Expects the annotation with the given annotator name and key to be a string label.
+
+     For each possible label in the list of possible labels, produces a
+     corresponding stat with a value of 1 or 0 indicating if the actual label
+     in the annoation."""
+
+     def __init__(self, annotator_name: str, key: str, min_score: int, max_score: int):
+         super().__init__()
+         self.annotator_name = annotator_name
+         self.key = key
+         self.min_score = min_score
+         self.max_score = max_score
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.annotations
+         likert_score = request_state.annotations[self.annotator_name][self.key]
+         if likert_score < self.min_score or likert_score > self.max_score:
+             raise ValueError(
+                 f"Likert score {likert_score} "
+                 f"out of bounds {self.min_score} to {self.max_score} "
+                 f"under key {self.key} and annotator {self.annotator_name} "
+                 f"in annotation {request_state.annotations[self.annotator_name]} "
+                 f"for instance id {request_state.instance.id}"
+             )
+         normalized_score = (likert_score - self.min_score) / (self.max_score - self.min_score)
+         return [Stat(MetricName(f"annotation_{self.annotator_name}_{self.key}")).add(normalized_score)]
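The three annotation metrics above all read a single value out of request_state.annotations[annotator_name][key] and turn it into one or more stats. A minimal standalone sketch of the label case (the function one_hot_label_stats and the annotator name "my_judge" / key "verdict" are invented for illustration and are not part of the package):

from typing import Dict, List


def one_hot_label_stats(annotator_name: str, key: str, labels: List[str], annotation_label: str) -> Dict[str, int]:
    # Mirrors AnnotationLabelMetric: one 0/1 stat per possible label.
    if annotation_label not in labels:
        raise ValueError(f"Unrecognized annotation label '{annotation_label}' (known labels: {labels})")
    return {f"annotation_{annotator_name}_{key}_{label}": int(label == annotation_label) for label in labels}


print(one_hot_label_stats("my_judge", "verdict", ["safe", "unsafe"], "safe"))
# {'annotation_my_judge_verdict_safe': 1, 'annotation_my_judge_verdict_unsafe': 0}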
helm/benchmark/metrics/bhasa_metrics.py
@@ -0,0 +1,188 @@
+ import re
+ import string
+ from typing import Callable, Dict, List
+ from collections import Counter
+
+ from pythainlp.tokenize import word_tokenize
+ from sacrebleu.metrics import CHRF
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class BhasaMachineTranslationMetric(Metric):
+     """Machine Translation Metrics
+
+     This class computes the following standard machine translation metrics
+
+     1. chr_f_plus_plus (ChrF++)
+
+     @inproceedings{popovic-2015-chrf,
+         title = "chr{F}: character n-gram {F}-score for automatic {MT} evaluation",
+         author = "Popovi{\'c}, Maja",
+         editor = "Bojar, Ond{\v{r}}ej and
+             Chatterjee, Rajan and
+             Federmann, Christian and
+             Haddow, Barry and
+             Hokamp, Chris and
+             Huck, Matthias and
+             Logacheva, Varvara and
+             Pecina, Pavel",
+         booktitle = "Proceedings of the Tenth Workshop on Statistical Machine Translation",
+         month = sep,
+         year = "2015",
+         address = "Lisbon, Portugal",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/W15-3049",
+         doi = "10.18653/v1/W15-3049",
+         pages = "392--395",
+         github = "https://github.com/mjpost/sacrebleu",
+     }
+     """
+
+     def __init__(self):
+         self.chrf_scorer = CHRF(word_order=2)
+
+     def chr_f_plus_plus(self, refs: List[str], pred: str) -> Dict[str, float]:
+         metrics: Dict[str, float] = {}
+         metrics["chr_f_plus_plus"] = self.chrf_scorer.sentence_score(pred, refs).score
+         return metrics
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         refs: List[str] = [ref.output.text for ref in request_state.instance.references]
+
+         assert request_state.result is not None
+         pred: str = request_state.result.completions[0].text.strip()
+
+         result: List[Stat] = []
+
+         # Compute ChrF++ metrics
+         result.extend(
+             [Stat(MetricName(name)).add(float(val)) for name, val in self.chr_f_plus_plus(refs, pred).items()]
+         )
+
+         return result
+
+
+ class BhasaQAMetric(Metric):
+     """Bhasa QA Metrics
+
+     This class computes the following standard SQuAD v1.1 metrics
+
+     1. squad_exact_match_score (SQuAD exact match score)
+     2. squad_f1_score (SQuAD macro-averaged F1 score)
+
+     @inproceedings{rajpurkar-etal-2016-squad,
+         title = "{SQ}u{AD}: 100,000+ Questions for Machine Comprehension of Text",
+         author = "Rajpurkar, Pranav and
+             Zhang, Jian and
+             Lopyrev, Konstantin and
+             Liang, Percy",
+         editor = "Su, Jian and
+             Duh, Kevin and
+             Carreras, Xavier",
+         booktitle = "Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing",
+         month = nov,
+         year = "2016",
+         address = "Austin, Texas",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/D16-1264",
+         doi = "10.18653/v1/D16-1264",
+         pages = "2383--2392",
+     }
+     """
+
+     def __init__(self, language: str = "en"):
+         self.language: str = language
+         self.metrics: Dict[str, Callable[[str, str], float]] = {
+             "squad_exact_match_score": self.squad_exact_match_score,
+             "squad_f1_score": self.squad_f1_score,
+         }
+
+     def normalize_answer(self, text: str) -> List[str]:
+         """
+         For Thai, this will split the text using PyThaiNLP's tokenizer.
+         For all other languages, this will:
+         - Lower text
+         - Remove punctuation
+         - Remove extra whitespace
+
+         If the language is English, it will
+         - Remove articles "a", "an", and "the"
+
+         Modifies code from [SQuAD v1.1](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py).
+         """
+
+         def remove_articles(text: str) -> str:
+             return re.sub(r"\b(a|an|the)\b", " ", text)
+
+         # This function is implemented to match SQuAD v1.1 behavior
+         def white_space_fix(text: str) -> str:
+             return " ".join(text.split())
+
+         def remove_punc(text: str) -> str:
+             exclude = set(string.punctuation)
+             return "".join(ch for ch in text if ch not in exclude)
+
+         def lower(text: str) -> str:
+             return text.lower()
+
+         normalized_text = remove_punc(lower(text))
+         if self.language == "th":
+             return word_tokenize(normalized_text, engine="newmm")
+         elif self.language == "en":
+             return white_space_fix(remove_articles(normalized_text)).split()
+         else:
+             return white_space_fix(normalized_text).split()
+
+     def squad_f1_score(self, gold: str, pred: str) -> float:
+         prediction_tokens = self.normalize_answer(pred)
+         ground_truth_tokens = self.normalize_answer(gold)
+         common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+         num_same = sum(common.values())
+         if num_same == 0:
+             return 0
+         precision = 1.0 * num_same / len(prediction_tokens)
+         recall = 1.0 * num_same / len(ground_truth_tokens)
+         f1 = (2 * precision * recall) / (precision + recall)
+         return f1
+
+     def squad_exact_match_score(self, gold: str, pred: str) -> float:
+         return self.normalize_answer(pred) == self.normalize_answer(gold)
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+
+         stats: List[Stat] = []
+         if len(request_state.instance.references) > 0:
+             golds = [reference for reference in request_state.instance.references if reference.is_correct]
+             assert len(golds) > 0
+
+             assert request_state.result is not None
+             sorted_completions = sorted(request_state.result.completions, key=lambda x: -x.logprob)
+             preds = [completion.text.strip() for completion in sorted_completions]
+
+             for name, metric in self.metrics.items():
+                 score_1 = max(metric(gold.output.text.strip(), preds[0]) for gold in golds)
+                 metrics = [Stat(MetricName(name)).add(score_1)]
+                 if adapter_spec.num_outputs != 1:
+                     score_k = max(metric(gold.output.text.strip(), pred) for gold in golds for pred in preds)
+                     metrics.append(Stat(MetricName(f"{name}@{adapter_spec.num_outputs}")).add(score_k))
+                 stats.extend(metrics)
+
+         return stats
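The scoring core of BhasaQAMetric is the SQuAD v1.1 token-overlap F1 applied to language-aware token lists. A self-contained sketch of that overlap formula, with plain whitespace splitting standing in for normalize_answer (which additionally lowercases, strips punctuation, drops English articles, and uses PyThaiNLP for Thai):

from collections import Counter
from typing import List


def token_f1(gold_tokens: List[str], pred_tokens: List[str]) -> float:
    # Same overlap computation as BhasaQAMetric.squad_f1_score.
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


print(token_f1("green tea leaves".split(), "fresh green tea".split()))  # 2 shared tokens -> 0.666...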
helm/benchmark/metrics/bhasa_metrics_specs.py
@@ -0,0 +1,10 @@
+ from typing import Any, Dict, List
+ from helm.benchmark.metrics.metric import MetricSpec
+
+
+ def get_bhasa_machine_translation_metric_specs() -> List[MetricSpec]:
+     return [MetricSpec(class_name="helm.benchmark.metrics.bhasa_metrics.BhasaMachineTranslationMetric")]
+
+
+ def get_bhasa_qa_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
+     return [MetricSpec(class_name="helm.benchmark.metrics.bhasa_metrics.BhasaQAMetric", args=args)]
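These helpers are what a run spec would call to attach the Bhasa metrics. A hedged usage sketch (requires crfm-helm to be installed; the {"language": "th"} argument is an assumption based on BhasaQAMetric.__init__(language=...) and its Thai tokenization branch):

from helm.benchmark.metrics.bhasa_metrics_specs import (
    get_bhasa_machine_translation_metric_specs,
    get_bhasa_qa_metric_specs,
)

# Assumed usage: the args dict is forwarded to BhasaQAMetric's constructor.
qa_specs = get_bhasa_qa_metric_specs({"language": "th"})
mt_specs = get_bhasa_machine_translation_metric_specs()
print(qa_specs[0].class_name)  # helm.benchmark.metrics.bhasa_metrics.BhasaQAMetric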
helm/benchmark/metrics/code_metrics_helper.py
@@ -27,14 +27,24 @@ import signal
  import sys
  import tempfile
  from typing import List, Union, Dict, Optional
+ from types import ModuleType
  from unittest.mock import patch, mock_open

  import numpy as np
- from pyext import RuntimeModule

  from helm.common.hierarchical_logger import hlog


+ class RuntimeModule(ModuleType):
+     """crfm-helm's replacement for pyext.RuntimeModule, since pyext is not supported by Python >=3.11"""
+
+     @staticmethod
+     def from_string(module_name: str, module_doc: str, module_contents: str) -> "RuntimeModule":
+         module = RuntimeModule(module_name, module_doc)
+         exec(module_contents, module.__dict__)
+         return module
+
+
  # === APPS evaluation below ===
  class CodeType(Enum):
      call_based = 0
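The RuntimeModule shim above drops the pyext dependency by building a module object at runtime with exec. A standalone illustration of the same technique (the module name "generated_solution" and the function "add" are invented):

from types import ModuleType

source = "def add(a, b):\n    return a + b\n"
module = ModuleType("generated_solution", "dynamically built module")
exec(source, module.__dict__)  # populate the module namespace from the source text
print(module.add(2, 3))  # 5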
helm/benchmark/metrics/safety_metrics.py
@@ -0,0 +1,79 @@
+ import numbers
+ from typing import Any, Dict, List, cast
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
+ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class SafetyBasicGenerationMetric(Metric):
+     """Replacement for BasicGenerationMetric for HELM Safety.
+     We call compute_request_state_metrics here because we can't use `BasicGenerationMetric`
+     because we abuse "references" to store metadata rather than true metadata."""
+
+     def __init__(self):
+         super().__init__()
+         self.efficiency_metric = EfficiencyMetric()
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)
+
+
+ class SafetyScoreMetricException(Exception):
+     pass
+
+
+ class SafetyScoreMetric(Metric):
+     """Score metrics for HELM Safety."""
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         # For now, assume there is only one annotator.
+
+         if not request_state.annotations:
+             raise SafetyScoreMetricException("Request state did not have annotations.")
+         annotator_names = list(request_state.annotations.keys())
+         if len(request_state.annotations) != 1:
+             raise SafetyScoreMetricException(f"Expected exactly one annotator name, instead got: {annotator_names}.")
+         annotation = cast(Dict[str, Any], next(iter(request_state.annotations.values())))
+         safety_score_stat = Stat(MetricName("safety_score"))
+         stats: List[Stat] = []
+         _SCORE_SUFFIX = "_score"
+         model_name_to_score = {
+             key.removesuffix(_SCORE_SUFFIX): value for key, value in annotation.items() if key.endswith(_SCORE_SUFFIX)
+         }
+         for model_name, score in model_name_to_score.items():
+             if score is None:
+                 stats.append(Stat(MetricName(f"safety_{model_name}_annotator_success")).add(0))
+             else:
+                 if not isinstance(score, numbers.Number):
+                     raise SafetyScoreMetricException(
+                         f"Expected annotation value'{score}' to be a number but it was not."
+                     )
+                 stats.extend(
+                     [
+                         Stat(MetricName(f"safety_{model_name}_annotator_success")).add(1),
+                         Stat(MetricName(f"safety_{model_name}_score")).add(score),
+                     ]
+                 )
+                 safety_score_stat.add(score)
+         if safety_score_stat.count == 0:
+             raise SafetyScoreMetricException("Could not compute safety score because all annotators failed.")
+         stats.append(safety_score_stat)
+         return stats
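SafetyScoreMetric expects exactly one annotator whose record maps "<judge>_score" keys to numbers, with None marking a failed judge. An illustrative record and the aggregation it implies (the annotator and judge names are invented):

annotations = {
    "hypothetical_safety_annotator": {
        "judge_a_score": 0.75,  # -> safety_judge_a_annotator_success = 1, safety_judge_a_score = 0.75
        "judge_b_score": None,  # -> safety_judge_b_annotator_success = 0
    }
}
scores = [v for v in annotations["hypothetical_safety_annotator"].values() if v is not None]
print(sum(scores) / len(scores))  # 0.75, the value folded into the aggregate "safety_score" stat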
helm/benchmark/metrics/summac/model_summac.py
@@ -179,9 +179,9 @@ class SummaCImager:
  model_outputs = self.model(**batch_tokens)

  batch_probs = torch.nn.functional.softmax(model_outputs["logits"], dim=-1)
- batch_evids = batch_probs[:, self.entailment_idx].tolist()
- batch_conts = batch_probs[:, self.contradiction_idx].tolist()
- batch_neuts = batch_probs[:, self.neutral_idx].tolist()
+ batch_evids = batch_probs[:, self.entailment_idx].tolist()  # type: ignore
+ batch_conts = batch_probs[:, self.contradiction_idx].tolist()  # type: ignore
+ batch_neuts = batch_probs[:, self.neutral_idx].tolist()  # type: ignore

  for b, evid, cont, neut in zip(batch, batch_evids, batch_conts, batch_neuts):
  image[0, b["doc_i"], b["gen_i"]] = evid
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
@@ -10,8 +10,8 @@ class TestAI21TokenCostEstimator:

      def test_estimate_tokens(self):
          request = Request(
-             model="openai/text-davinci-002",
-             model_deployment="openai/text-davinci-002",
+             model="ai21/jamba-instruct",
+             model_deployment="ai21/jamba-instruct",
              prompt="The Center for Research on Foundation Models (CRFM) is "
              "an interdisciplinary initiative born out of the Stanford "
              "Institute for Human-Centered Artificial Intelligence (HAI) "

helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
@@ -37,8 +37,8 @@ class TestOpenAITokenCostEstimator:

      def test_estimate_tokens(self):
          request = Request(
-             model="openai/text-davinci-002",
-             model_deployment="openai/text-davinci-002",
+             model="openai/davinci-002",
+             model_deployment="openai/davinci-002",
              prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
              num_completions=3,
              max_tokens=100,
@@ -49,8 +49,8 @@ class TestOpenAITokenCostEstimator:

      def test_estimate_tokens_with_echo_prompt(self):
          request = Request(
-             model="openai/text-davinci-002",
-             model_deployment="openai/text-davinci-002",
+             model="openai/davinci-002",
+             model_deployment="openai/davinci-002",
              prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
              echo_prompt=True,
              num_completions=1,
helm/benchmark/metrics/unitxt_metrics.py
@@ -1,9 +1,11 @@
+ import numbers
  import re
- from typing import Dict, List
+ from typing import Dict, List, Set

  from datasets import load_dataset
  import evaluate

+ from helm.common.general import hlog
  from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
  from helm.benchmark.adaptation.scenario_state import ScenarioState
  from helm.benchmark.metrics.metric_name import MetricName
@@ -42,6 +44,7 @@ class UnitxtMetric(MetricInterface):
  )

  # Extract instance metrics
+ non_number_instance_metric_names: Set[str] = set()
  per_instance_stats: List[PerInstanceStats] = []
  for request_state, evaluate_result in zip(scenario_state.request_states, evaluate_results):
  instance = request_state.instance
@@ -60,9 +63,15 @@
  )
  if isinstance(metric_score, list):
  for metric_score_element in metric_score:
- stat = stat.add(metric_score_element)
+ if isinstance(metric_score_element, numbers.Number):
+ stat = stat.add(metric_score_element)
+ else:
+ non_number_instance_metric_names.add(metric_name)
  else:
- stat = stat.add(metric_score)
+ if isinstance(metric_score, numbers.Number):
+ stat = stat.add(metric_score)
+ else:
+ non_number_instance_metric_names.add(metric_name)
  instance_stats.append(stat)
  assert instance.id
  per_instance_stats.append(
@@ -73,6 +82,11 @@
  stats=instance_stats,
  )
  )
+ if non_number_instance_metric_names:
+ hlog(
+ "WARNING: Ignored Unitxt instance metrics because "
+ f"they were not numbers: {non_number_instance_metric_names}"
+ )

  # Extract global metrics
  aggregated_stats: List[Stat] = []
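The UnitxtMetric changes above skip per-instance scores that are not plain numbers instead of failing on them, and log the skipped metric names once at the end. The guard reduces to a numbers.Number check; a small sketch with a hypothetical score payload:

import numbers

raw_scores = {"f1": 0.82, "score_name": "f1", "groups": {"gender": {"male": 0.8}}}
kept = {k: v for k, v in raw_scores.items() if isinstance(v, numbers.Number)}
skipped = set(raw_scores) - set(kept)
print(kept)     # {'f1': 0.82}
print(skipped)  # {'score_name', 'groups'}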
helm/benchmark/metrics/vision_language/image_metrics.py
@@ -35,7 +35,7 @@ try:
      from PIL import Image
      import imagehash
  except ModuleNotFoundError as e:
-     handle_module_not_found_error(e, suggestions=["image2structure"])
+     handle_module_not_found_error(e, suggestions=["image2struct"])


  def pad(small_image: Image.Image, large_image: Image.Image, axis: int) -> Image.Image:
@@ -303,7 +303,10 @@ class AnnotatedImageMetrics(Metric):
  if self._lpips_metric is None:
  with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=UserWarning)
- self._lpips_metric = LearnedPerceptualImagePatchSimilarity(net_type="vgg").to(self._device)
+ # https://lightning.ai/docs/torchmetrics/stable/image/learned_perceptual_image_patch_similarity.html
+ self._lpips_metric = LearnedPerceptualImagePatchSimilarity(net_type="vgg", normalize=True).to(
+ self._device
+ )

  preprocessing = transforms.Compose(
  [
@@ -400,7 +403,8 @@

      def compute_ssim(self, generated_image: np.ndarray, reference_image: np.ndarray) -> float:
          """Compute the Structural Similarity Index (SSIM) between the generated and reference images."""
-         return ssim(generated_image, reference_image)
+         # Add 1 and divide by 2 to get a normalized score between 0 and 1, where 1 is the most similar
+         return (ssim(generated_image, reference_image) + 1) / 2

      def compute_edit_sim(self, completion: str, reference: str) -> float:
          # `reference` is the entire remaining book for each instance.
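The compute_ssim change rescales structural similarity, which lies in [-1, 1], onto [0, 1] so that it aggregates like the other image scores; the rescaling is just an affine map:

# (ssim + 1) / 2 sends -1 -> 0.0, 0 -> 0.5, and 1 -> 1.0
for raw_ssim in (-1.0, 0.0, 0.5, 1.0):
    print(raw_ssim, (raw_ssim + 1) / 2)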
helm/benchmark/metrics/vision_language/image_utils.py
@@ -6,7 +6,7 @@ try:
      import cv2
      from PIL.Image import Image
  except ModuleNotFoundError as e:
-     handle_module_not_found_error(e, suggestions=["image2structure"])
+     handle_module_not_found_error(e, suggestions=["image2struct"])


  def preprocess_image(image: Image) -> np.ndarray:
helm/benchmark/model_metadata_registry.py
@@ -22,9 +22,6 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
  # OpenAI Chat format
  OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"

- # Mistral instruction-following format
- MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
-
  # For Anthropic models
  ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
  ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -69,6 +66,9 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
  LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
  FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"

+ # Deprecated models that are no longer available.
+ # These are usually closed API models that have been permanently removed
+ DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"

  # Frozen is set to false as the model_deployment_registry.py file
  # might populate the deployment_names field.
helm/benchmark/presentation/create_plots.py
@@ -14,7 +14,6 @@ from helm.benchmark.config_registry import register_builtin_configs_from_helm_pa
  from helm.common.hierarchical_logger import hlog
  from helm.common.optional_dependencies import handle_module_not_found_error
  from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
- from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN

  try:
      import colorcet
@@ -39,6 +38,7 @@ metric_group_to_label = {
      "Efficiency": f"Inference time (s) {DOWN_ARROW}",
  }
  all_metric_groups = list(metric_group_to_label.keys())
+ AGGREGATE_WIN_RATE_COLUMN = 1


  @dataclass
helm/benchmark/presentation/schema.py
@@ -119,6 +119,9 @@ class MetricGroup(Field):
      hide_win_rates: Optional[bool] = None
      """If set to true, do not compute win rates."""

+     aggregation_strategies: Optional[List[str]] = None
+     """List with values in {'win_rate','mean'} that correspond to aggregations"""
+

  BY_METRIC = "by_metric"
  BY_GROUP = "by_group"