PyPI - crfm-helm - Versions diffs - 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (606) hide show

helm/benchmark/metrics/omni_math_metrics.py ADDED Viewed

@@ -0,0 +1,32 @@
+from typing import Any, Dict, List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+class OmniMATHMetric(Metric):
+    """Score metrics for Omni-MATH."""
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["omni_math"]
+        scores: List[int] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_equivalence_judgement") and annotation_value is not None:
+                scores.append(int(annotation_value))
+        if not scores:
+            raise ValueError("Could not compute Omni-MATH accuracy because all annotators failed.")
+        score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("omni_math_accuracy")).add(score),
+        ]

helm/benchmark/metrics/output_processing_metric.py ADDED Viewed

@@ -0,0 +1,60 @@
+import dataclasses
+from typing import Any, Dict, List, TypedDict
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.metric import (
+    create_metric,
+    Metric,
+    MetricInterface,
+    MetricResult,
+    MetricSpec,
+    PerInstanceStats,
+)
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.object_spec import get_class_by_name
+from helm.common.request import GeneratedOutput
+class _MetricSpecDict(TypedDict):
+    class_name: str
+    args: Dict[str, Any]
+def _dict_to_metric_spec(metric_spec_dict: _MetricSpecDict) -> MetricSpec:
+    return MetricSpec(metric_spec_dict["class_name"], metric_spec_dict["args"])
+class OutputProcessingMetric(MetricInterface):
+    def __init__(self, processor: str, metric_specs: List[_MetricSpecDict]):
+        self.processor = get_class_by_name(processor)  # actually a function, not a class
+        self.metrics: List[Metric] = [create_metric(_dict_to_metric_spec(metric_spec)) for metric_spec in metric_specs]
+    def _process_request_state(self, request_state: RequestState) -> RequestState:
+        if not request_state.result:
+            return request_state
+        processed_completions: List[GeneratedOutput] = []
+        for completion in request_state.result.completions:
+            processed_completions.append(dataclasses.replace(completion, text=self.processor(completion.text)))
+        return dataclasses.replace(
+            request_state, result=dataclasses.replace(request_state.result, completions=processed_completions)
+        )
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        aggregated_stats: List[Stat] = []
+        per_instance_stats: List[PerInstanceStats] = []
+        processed_scenario_state = dataclasses.replace(
+            scenario_state,
+            request_states=[
+                self._process_request_state(request_state) for request_state in scenario_state.request_states
+            ],
+        )
+        for metric in self.metrics:
+            metric_result = metric.evaluate(processed_scenario_state, metric_service, eval_cache_path, parallelism)
+            aggregated_stats.extend(metric_result.aggregated_stats)
+            per_instance_stats.extend(metric_result.per_instance_stats)
+        return MetricResult(aggregated_stats=aggregated_stats, per_instance_stats=per_instance_stats)

helm/benchmark/metrics/output_processors.py ADDED Viewed

@@ -0,0 +1,15 @@
+import re
+def remove_deepseek_r1_thinking(input: str) -> str:
+    if "<think>" not in input:
+        return input
+    if "</think>\n\n" in input:
+        # The think block is usually followed by two newlines, so we should remove that
+        return re.sub("<think>.*</think>\n\n", "", input, flags=re.DOTALL)
+    elif "</think>" in input:
+        return re.sub("<think>.*</think>", "", input, flags=re.DOTALL)
+    else:
+        # Unclosed think block
+        return ""

helm/benchmark/metrics/paraphrase_generation_metrics.py CHANGED Viewed

@@ -2,8 +2,8 @@ from typing import List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 from nltk.translate.bleu_score import corpus_bleu

helm/benchmark/metrics/ranking_metrics.py CHANGED Viewed

@@ -9,9 +9,9 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference
 from helm.common.request import RequestResult
 from helm.common.general import assert_present, binarize_dict
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 try:
     import pytrec_eval

helm/benchmark/metrics/reference_metric.py CHANGED Viewed

@@ -14,9 +14,9 @@ from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_LANGUAGE_MO
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric_name import MetricName, MetricContext
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric_name import MetricName, MetricContext
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
 @dataclass(frozen=True)

helm/benchmark/metrics/safety_metrics.py CHANGED Viewed

@@ -1,4 +1,5 @@
-from typing import List
+import numbers
+from typing import Any, Dict, List, cast
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
@@ -29,6 +30,10 @@ class SafetyBasicGenerationMetric(Metric):
         return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)
+class SafetyScoreMetricException(Exception):
+    pass
 class SafetyScoreMetric(Metric):
     """Score metrics for HELM Safety."""
@@ -39,19 +44,36 @@ class SafetyScoreMetric(Metric):
         metric_service: MetricService,
         eval_cache_path: str,
     ) -> List[Stat]:
-        assert request_state.annotations
-        safety_scenarios = ["harm_bench", "anthropic_red_team", "simple_safety_tests", "xstest"]
-        curr_scenario = None
-        for annotation_key in request_state.annotations:
-            if annotation_key in safety_scenarios:
-                curr_scenario = annotation_key
-        if curr_scenario is None:
-            raise ValueError("No safety scenario found in annotations")
-        gpt_score = request_state.annotations[curr_scenario]["gpt_score"]
-        llama_score = request_state.annotations[curr_scenario]["llama_score"]
-        score = (gpt_score + llama_score) / 2
-        return [
-            Stat(MetricName("safety_score")).add(score),
-            Stat(MetricName("safety_gpt_score")).add(gpt_score),
-            Stat(MetricName("safety_llama_score")).add(llama_score),
-        ]
+        # For now, assume there is only one annotator.
+        if not request_state.annotations:
+            raise SafetyScoreMetricException("Request state did not have annotations.")
+        annotator_names = list(request_state.annotations.keys())
+        if len(request_state.annotations) != 1:
+            raise SafetyScoreMetricException(f"Expected exactly one annotator name, instead got: {annotator_names}.")
+        annotation = cast(Dict[str, Any], next(iter(request_state.annotations.values())))
+        safety_score_stat = Stat(MetricName("safety_score"))
+        stats: List[Stat] = []
+        _SCORE_SUFFIX = "_score"
+        model_name_to_score = {
+            key.removesuffix(_SCORE_SUFFIX): value for key, value in annotation.items() if key.endswith(_SCORE_SUFFIX)
+        }
+        for model_name, score in model_name_to_score.items():
+            if score is None:
+                stats.append(Stat(MetricName(f"safety_{model_name}_annotator_success")).add(0))
+            else:
+                if not isinstance(score, numbers.Number):
+                    raise SafetyScoreMetricException(
+                        f"Expected annotation value'{score}' to be a number but it was not."
+                    )
+                stats.extend(
+                    [
+                        Stat(MetricName(f"safety_{model_name}_annotator_success")).add(1),
+                        Stat(MetricName(f"safety_{model_name}_score")).add(score),
+                    ]
+                )
+                safety_score_stat.add(score)
+        if safety_score_stat.count == 0:
+            raise SafetyScoreMetricException("Could not compute safety score because all annotators failed.")
+        stats.append(safety_score_stat)
+        return stats

helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} RENAMED Viewed

@@ -14,7 +14,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
-class BhasaMachineTranslationMetric(Metric):
+class SEAHELMMachineTranslationMetric(Metric):
     """Machine Translation Metrics
     This class computes the following standard machine translation metrics
@@ -74,8 +74,8 @@ class BhasaMachineTranslationMetric(Metric):
         return result
-class BhasaQAMetric(Metric):
-    """Bhasa QA Metrics
+class SEAHELMQAMetric(Metric):
+    """SEAHELM QA Metrics
     This class computes the following standard SQuAD v1.1 metrics

helm/benchmark/metrics/seahelm_metrics_specs.py ADDED Viewed

@@ -0,0 +1,10 @@
+from typing import Any, Dict, List
+from helm.benchmark.metrics.metric import MetricSpec
+def get_seahelm_machine_translation_metric_specs() -> List[MetricSpec]:
+    return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMMachineTranslationMetric")]
+def get_seahelm_qa_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
+    return [MetricSpec(class_name="helm.benchmark.metrics.seahelm_metrics.SEAHELMQAMetric", args=args)]

helm/benchmark/metrics/spider_metrics.py ADDED Viewed

@@ -0,0 +1,7 @@
+from helm.benchmark.metrics.bird_sql_metrics import BirdSQLMetric
+class SpiderMetric(BirdSQLMetric):
+    """Score metrics for Spider. Based on Bird-SQL."""
+    ANNOTATOR_NAME = "spider"

helm/benchmark/metrics/starr_patient_instructions_metrics.py ADDED Viewed

@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.starr_patient_instructions_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+class StarrPatientInstructionsMetric(Metric):
+    """Score metrics for StarrPatientInstructions."""
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["starr_patient_instructions"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("starr_patient_instructions_accuracy")).add(score),
+        ]

helm/benchmark/metrics/statistic.py CHANGED Viewed

@@ -2,7 +2,7 @@ from dataclasses import dataclass, replace
 import math
 from typing import Dict, Optional
-from .metric_name import MetricName
+from helm.benchmark.metrics.metric_name import MetricName
 @dataclass

helm/benchmark/metrics/summac/model_summac.py CHANGED Viewed

@@ -10,7 +10,7 @@ import numpy.typing as npt
 import torch
 import os
 import json
-from . import utils_misc
+from helm.benchmark.metrics.summac import utils_misc
 model_map = {
     "snli-base": {"model_card": "boychaboy/SNLI_roberta-base", "entailment_idx": 0, "contradiction_idx": 2},

helm/benchmark/metrics/summarization_critique_metrics.py CHANGED Viewed

@@ -2,10 +2,10 @@ from typing import Dict, List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType

helm/benchmark/metrics/summarization_metrics.py CHANGED Viewed

@@ -16,12 +16,16 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .metric import Metric, MetricResult
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
-from .summac.model_summac import SummaCZS
-from bert_score import BERTScorer
+from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.summac.model_summac import SummaCZS
+try:
+    from bert_score import BERTScorer  # type: ignore
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["summarization"])
 QAFACTEVAL_URL: str = (
@@ -43,7 +47,13 @@ class SummarizationMetric(Metric):
     4. Faithfulness (SummaC)
     """
-    def __init__(self, task: str, device: str = "cpu"):
+    def __init__(
+        self,
+        task: str,
+        device: str = "cpu",
+        bertscore_model: str = "microsoft/deberta-large-mnli",
+        rescale_with_baseline: bool = True,
+    ):
         self.rouge_fns = {
             "rouge_1": get_rouge_function("rouge1"),
             "rouge_2": get_rouge_function("rouge2"),
@@ -56,7 +66,7 @@ class SummarizationMetric(Metric):
             spacy.cli.download("en_core_web_sm")
         try:
-            from summ_eval.data_stats_metric import DataStatsMetric
+            from summ_eval.data_stats_metric import DataStatsMetric  # type: ignore
         except ModuleNotFoundError as e:
             handle_module_not_found_error(e, ["summarization"])
@@ -71,7 +81,7 @@ class SummarizationMetric(Metric):
         else:
             self.compute_bertscore = True
             self.bert_scorer = BERTScorer(
-                model_type="microsoft/deberta-large-mnli", lang="en", rescale_with_baseline=True, device=device
+                model_type=bertscore_model, lang="en", rescale_with_baseline=rescale_with_baseline, device=device
             )
             # Need GPU for faithfulness metrics since they are model-based.
             self.compute_faithfulness = True

helm/benchmark/metrics/test_bias_metrics.py CHANGED Viewed

@@ -1,7 +1,11 @@
 from dataclasses import dataclass
 from typing import Callable, List, Optional
-from .bias_metrics import BiasMetric
+from helm.benchmark.metrics.bias_metrics import BiasMetric
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
+install_nltk_resources()
 @dataclass(frozen=True)

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl