crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/cleva_harms_metrics.py
@@ -14,12 +14,13 @@ from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .bias_metrics import BiasMetric
-from .toxicity_metrics import ToxicityMetric
-from .copyright_metrics import BasicCopyrightMetric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.bias_metrics import BiasMetric
+from helm.benchmark.metrics.toxicity_metrics import ToxicityMetric
+from helm.benchmark.metrics.copyright_metrics import BasicCopyrightMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.runner import get_benchmark_output_path

 try:
     import jieba
@@ -71,7 +72,7 @@ class CLEVABiasMetric(BiasMetric):
             "demographic_category". One of "adjective" or "profession".
         """
         # Ensure all necessary data are downloaded
-        self.output_dir = os.path.join("benchmark_output", "metrics", self.name)
+        self.output_dir = os.path.join(get_benchmark_output_path(), "metrics", self.name)
         ensure_directory_exists(self.output_dir)
         for filename in self.FILE_NAMES:
             target_path = os.path.join(self.output_dir, filename)
helm/benchmark/metrics/code_metrics.py
@@ -10,11 +10,11 @@ from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.scenarios.code_scenario import CodeReference
-from . import code_metrics_helper
-from .metric import Metric, MetricResult
-from .metric_service import MetricService
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics import code_metrics_helper
+from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat

 MAXIMUM_MEMORY_BYTES = 8 * 1024 * 1024 * 1024  # 8GB.
helm/benchmark/metrics/comet_metric.py
@@ -0,0 +1,125 @@
+import logging
+from typing import List
+
+import comet
+from torch import nn
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+from helm.common.request import RequestResult
+
+
+class CometMetric(Metric):
+    """COMET machine translation metric using a regression model.
+    The model takes a triplet of source sentence, translation, and reference
+    and computes a score in the range [0, 1] reflecting the quality of the predicted
+    translation.
+
+    Paper:
+    @inproceedings{rei-etal-2022-comet,
+        title = "{COMET}-22: Unbabel-{IST} 2022 Submission for the Metrics Shared Task",
+        author = "Rei, Ricardo and
+          C. de Souza, Jos{\'e} G. and
+          Alves, Duarte and
+          Zerva, Chrysoula and
+          Farinha, Ana C and
+          Glushkova, Taisiya and
+          Lavie, Alon and
+          Coheur, Luisa and
+          Martins, Andr{\'e} F. T.",
+        editor = {Koehn, Philipp and
+          Barrault, Lo{\"\i}c and
+          Bojar, Ond{\v{r}}ej and
+          Bougares, Fethi and
+          Chatterjee, Rajen and
+          Costa-juss{\`a}, Marta R. and
+          Federmann, Christian and
+          Fishel, Mark and
+          Fraser, Alexander and
+          Freitag, Markus and
+          Graham, Yvette and
+          Grundkiewicz, Roman and
+          Guzman, Paco and
+          Haddow, Barry and
+          Huck, Matthias and
+          Jimeno Yepes, Antonio and
+          Kocmi, Tom and
+          Martins, Andr{\'e} and
+          Morishita, Makoto and
+          Monz, Christof and
+          Nagata, Masaaki and
+          Nakazawa, Toshiaki and
+          Negri, Matteo and
+          N{\'e}v{\'e}ol, Aur{\'e}lie and
+          Neves, Mariana and
+          Popel, Martin and
+          Turchi, Marco and
+          Zampieri, Marcos},
+        booktitle = "Proceedings of the Seventh Conference on Machine Translation (WMT)",
+        month = dec,
+        year = "2022",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2022.wmt-1.52",
+    }
+    """
+
+    METRIC_NAME = "comet"
+
+    def __init__(self, task: str, model_name: str = "Unbabel/wmt22-comet-da", device: str = "cpu"):
+        self.model_name = model_name
+        self.comet_scorer: nn.Module = self._load_model(model_name)
+        self.num_gpus = 0 if device == "cpu" else 1
+
+        # suppress warnings from PyTorch Lightning which spams terminal
+        logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.WARNING)
+        logging.getLogger("lightning.pytorch.accelerators.cuda").setLevel(logging.WARNING)
+
+    @staticmethod
+    def _load_model(model_name: str) -> nn.Module:
+        """Load Comet model from the checkpoint.
+
+        Returns:
+            The loaded model.
+        """
+        return comet.load_from_checkpoint(comet.download_model(model_name))
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        hlog(
+            f"Setting parallelism from {parallelism} to 1, since "
+            f"evaluating {self.__class__.__name__} with parallelism > 1 seg faults."
+        )
+        return super().evaluate(scenario_state, metric_service, eval_cache_path, parallelism=1)
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Compute the COMET score for this instance."""
+        assert len(request_state.instance.references) == 1
+        ref = request_state.instance.references[0].output.text
+        src = request_state.instance.input.text
+
+        result = request_state.result
+        if not isinstance(result, RequestResult):
+            raise TypeError(f"Expected a valid result, but got {result}!")
+        mt = result.completions[0].text.strip()
+
+        # comet requires this exact format
+        data = [dict(ref=ref, src=src, mt=mt)]
+        output = self.comet_scorer.predict(data, gpus=self.num_gpus, progress_bar=False)  # type: ignore
+        comet_score = output[0][0]  # extract the actual score
+
+        metric_result = [Stat(MetricName(self.METRIC_NAME)).add(comet_score)]
+
+        return metric_result
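For orientation, here is a minimal standalone sketch of the scoring path that CometMetric wraps, assuming the unbabel-comet package is installed. The example sentences are made up, and the exact structure of predict's return value varies across comet versions, so the final unpacking simply mirrors the diff rather than a guaranteed API.

# Minimal sketch of the underlying COMET call (illustrative only).
import comet

# Download and load the regression checkpoint, as in CometMetric._load_model.
model = comet.load_from_checkpoint(comet.download_model("Unbabel/wmt22-comet-da"))

# COMET expects a list of {src, mt, ref} triplets.
data = [
    {
        "src": "Der Hund bellt.",     # source sentence
        "mt": "The dog is barking.",  # translation to score
        "ref": "The dog barks.",      # human reference
    }
]

# gpus=0 runs on CPU; recent versions also return a corpus-level system score.
output = model.predict(data, gpus=0, progress_bar=False)
print(output[0][0])  # per-segment score, unpacked the same way as in evaluate_generation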
helm/benchmark/metrics/common_metric_specs.py
@@ -44,11 +44,18 @@ def get_language_modeling_metric_specs(names: List[str]) -> List[MetricSpec]:
     ]


-def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+def get_classification_metric_specs(
+    labels: Optional[List[str]] = None, delimiter: Optional[str] = None
+) -> List[MetricSpec]:
+    extra_args: Dict[str, Any] = {}
+    if labels:
+        extra_args["labels"] = labels
+    if delimiter:
+        extra_args["delimiter"] = delimiter
     return [
         MetricSpec(
             class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-            args={"delimiter": delimiter},
+            args=extra_args,
         )
     ]

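As a quick illustration of the new signature, a hypothetical call (the label values are made up); only non-empty arguments are forwarded to the MetricSpec:

specs = get_classification_metric_specs(labels=["positive", "negative", "neutral"])
# -> [MetricSpec(class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
#                args={"labels": ["positive", "negative", "neutral"]})]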
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py
@@ -0,0 +1,72 @@
+import re
+from typing import Any, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+from helm.common.hierarchical_logger import hlog
+
+
+def _strip_string(str: str) -> Any:
+    # from https://stackoverflow.com/a/4703508
+    numeric_const_pattern = r"[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?"
+    match = re.search(numeric_const_pattern, str)
+    if match:
+        try:
+            return float(str[match.start() : match.end()])
+        except Exception:
+            return None
+    return None
+
+
+def float_equiv(str1: str, str2: str, eps: float = 1e-6) -> float:
+    """Check if two values have the same float value, up to a small tolerance.
+
+    This is the implementation used in the IBM Enterprise Benchmark paper.
+
+    Note: This is a "mostly-correct" equality function and does not handle some cases correctly:
+
+    - If either value is not parseable as a float, it returns 0.0, even when the
+      strings match exactly.
+    - If the two values have different units (e.g. currency symbols, a trailing "M" or "B",
+      or a trailing %), the values will not be converted to the same units before comparison.
+    """
+    try:
+        ss1 = _strip_string(str1)
+        ss2 = _strip_string(str2)
+
+        if ss1 is None or ss2 is None:
+            hlog("WARNING: float_equiv returning 0.0 because at least one value is a non-float")
+            return 0.0
+        return float(abs(ss1 - ss2) < eps)
+    except Exception:
+        return float(str1 == str2)
+
+
+class ConvFinQACalcMetric(Metric):
+    """Score metrics for ConvFinQA-Calc."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_answer = request_state.result.completions[0].text
+
+        assert len(request_state.instance.references) == 1
+        assert len(request_state.instance.references[0].tags) == 1
+        assert request_state.instance.references[0].tags[0] == CORRECT_TAG
+        gold_answer = request_state.instance.references[0].output.text
+
+        return [
+            Stat(MetricName("float_equiv")).add(float_equiv(model_answer, gold_answer)),
+        ]
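To make the caveats concrete, a few illustrative calls and the values they produce under the regex and default eps above (the inputs are made up):

float_equiv("The answer is 1.50", "1.4999999999")  # 1.0: both parse to ~1.5, within eps
float_equiv("3.14", "3.15")                        # 0.0: difference exceeds eps=1e-6
float_equiv("$120M", "120")                        # 1.0: "$" and "M" are dropped, not converted
float_equiv("n/a", "n/a")                          # 0.0: neither side parses as a float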
helm/benchmark/metrics/copyright_metrics.py
@@ -9,10 +9,10 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import RequestResult
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat

 try:
     import numba
helm/benchmark/metrics/czech_bank_qa_metrics.py
@@ -0,0 +1,29 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class CzechBankQAMetrics(Metric):
+    """Score metrics for Czech Bank QA."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        # assert len(request_state.instance.references) > 1
+        # category_text = request_state.instance.references[0].output.text
+        # category_parts = category_text.split(".")
+        # assert len(category_parts) == 3
+        assert request_state.annotations
+        error_rate = 0.0 if request_state.annotations["czech_bank_qa"]["error"] is None else 1.0
+        return [
+            Stat(MetricName("error_rate")).add(error_rate),
+        ]
helm/benchmark/metrics/decodingtrust_fairness_metrics.py
@@ -1,8 +1,8 @@
 from typing import List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
 import numpy as np
 from fairlearn.metrics import equalized_odds_difference
helm/benchmark/metrics/decodingtrust_privacy_metrics.py
@@ -1,7 +1,7 @@
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 from typing import List
 import re

helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py
@@ -3,8 +3,8 @@ from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMe
 from helm.common.request import GeneratedOutput
 from helm.common.request import RequestResult
 from helm.benchmark.adaptation.request_state import RequestState
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat

 import numpy as np

helm/benchmark/metrics/dischargeme_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class DischargeMeMetric(Metric):
+    """Score metrics for DischargeMe."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["dischargeme"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("dischargeme_accuracy")).add(score),
+        ]
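For orientation, a hypothetical annotations payload that the loop above would average; the annotator keys and criterion names here are invented, and only the nesting and the "score" field are implied by the code:

# Hypothetical shape of request_state.annotations["dischargeme"] (illustrative).
annotations = {
    "gpt-4o": {                      # counted only if this key appears in ANNOTATOR_MODELS
        "completeness": {"score": 4},
        "correctness": {"score": 5},
    },
    "claude": None,                  # skipped: annotation_dict is None
}
# scores becomes [4, 5], so dischargeme_accuracy = 4.5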
helm/benchmark/metrics/disinformation_metrics.py
@@ -11,10 +11,10 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import RequestResult, GeneratedOutput
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat

 try:
     from sacrebleu.metrics import BLEU
helm/benchmark/metrics/dry_run_metrics.py
@@ -8,11 +8,11 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
-from .metric import MetricInterface, MetricResult, PerInstanceStats
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .tokens.auto_token_cost_estimator import AutoTokenCostEstimator
-from .tokens.token_cost_estimator import TokenCostEstimator
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.tokens.auto_token_cost_estimator import AutoTokenCostEstimator
+from helm.benchmark.metrics.tokens.token_cost_estimator import TokenCostEstimator


 @dataclass
helm/benchmark/metrics/efficiency_metrics.py
@@ -9,9 +9,9 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat


 EFFICIENCY_DATA_PACKAGE: str = "helm.benchmark.efficiency_data"
helm/benchmark/metrics/ehr_sql_metrics.py
@@ -0,0 +1,103 @@
+from typing import List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+
+
+class EhrSqlMetric(Metric):
+    """
+    Metric for evaluating the EHR SQL dataset, focusing on:
+    1. Execution Accuracy – whether the generated SQL query produces the same results as the ground truth.
+    2. Query Validity – whether the generated SQL executes without errors.
+    3. Precision for Answerable Questions (Pans).
+    4. Recall for Answerable Questions (Rans).
+    """
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Evaluate execution accuracy, query validity, and answerability metrics."""
+
+        if not request_state.annotations:
+            hlog(f"Warning: Request state missing annotations for instance {request_state.instance}")
+            return []
+
+        if "ehr_sql" not in request_state.annotations:
+            hlog(f"Warning: 'ehr_sql' key missing in annotations for instance {request_state.instance}")
+            return []
+
+        # Extract execution results
+        predicted_result = request_state.annotations["ehr_sql"].get("predicted_result", [])
+        ground_truth_result = request_state.annotations["ehr_sql"].get("ground_truth_result", [])
+        query_error = request_state.annotations["ehr_sql"].get("query_error", None)
+
+        # Extract predictions from the model output
+        if request_state.result is None:
+            predictions = []
+        else:
+            predictions = [completion.text.strip() for completion in request_state.result.completions]
+        if not predictions:
+            hlog(f"Warning: No predictions found in the completions for instance {request_state.instance}")
+            return []
+
+        # Process the first prediction as the primary output
+        prediction = predictions[0].strip()
+
+        # Extract references and input text
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        # Check if the ground truth is answerable based on the `is_impossible` flag
+        ground_truth_query = references[0].output.text.strip() if references else None
+        is_impossible = (
+            request_state.instance.extra_data.get("is_impossible", False)
+            if request_state.instance.extra_data
+            else False
+        )
+
+        is_answerable = not is_impossible and bool(ground_truth_query)  # True if the ground truth is answerable
+        is_predicted_answerable = bool(prediction)  # True if the model generated a non-empty SQL query
+        correct_answerable = int(is_answerable and is_predicted_answerable)  # Correct if both are answerable
+
+        # Execution accuracy
+        execution_accuracy = 0
+        if ground_truth_query:
+            if ground_truth_result and predicted_result:
+                execution_accuracy = int(set(predicted_result) == set(ground_truth_result))  # Compare sets.
+            elif not ground_truth_result and not predicted_result and not prediction:
+                execution_accuracy = 1  # Both empty and no query was generated.
+        elif not ground_truth_query and prediction:
+            execution_accuracy = 0  # The model generated a query when no gold query exists.
+
+        # Query validity
+        if not prediction:  # No SQL query was generated
+            query_validity = 0
+        elif query_error is None:
+            query_validity = 1  # Query executed successfully.
+        else:
+            query_validity = 0  # Execution error occurred.
+
+        return [
+            # Execution-based Metrics
+            Stat(MetricName("ehr_sql_execution_accuracy")).add(execution_accuracy),
+            Stat(MetricName("ehr_sql_query_validity")).add(query_validity),
+            # Answerability Metrics
+            Stat(MetricName("ehr_sql_precision_answerable")).add(correct_answerable if is_predicted_answerable else 0),
+            Stat(MetricName("ehr_sql_recall_answerable")).add(correct_answerable if is_answerable else 0),
+            Stat(MetricName("ehr_sql_total_predicted_answerable")).add(int(is_predicted_answerable)),
+            Stat(MetricName("ehr_sql_total_ground_truth_answerable")).add(int(is_answerable)),
+        ]
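The four answerability stats are per-instance counts, so corpus-level precision (Pans) and recall (Rans) fall out as ratios of their sums. A hypothetical aggregation, not part of the diff; the function and parameter names are invented for illustration:

# Hypothetical aggregation across instances.
def pans(precision_hits, predicted_answerable):
    # Precision for answerable questions: correct predictions / all predicted-answerable instances.
    return sum(precision_hits) / max(sum(predicted_answerable), 1)

def rans(recall_hits, ground_truth_answerable):
    # Recall for answerable questions: correct predictions / all ground-truth-answerable instances.
    return sum(recall_hits) / max(sum(ground_truth_answerable), 1)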
helm/benchmark/metrics/evaluate_instances_metric.py
@@ -6,9 +6,9 @@ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, add_con

 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
-from .metric_name import MetricName, MetricContext
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric_name import MetricName, MetricContext
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat


 class EvaluateInstancesMetric(MetricInterface, ABC):