crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/evaluate_reference_metrics.py

@@ -1,32 +1,31 @@
 from dataclasses import replace
+from functools import partial
 from typing import Callable, Dict, List, Optional, Set, Tuple, cast
+import re
+import string
+
+from nltk.metrics.scores import f_measure
+from nltk.tokenize import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from rouge_score import rouge_scorer
 import numpy as np
-
+
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics import code_metrics_helper
 from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
+from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
-from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
-from nltk.metrics.scores import f_measure
-from nltk.translate.bleu_score import sentence_bleu
-from nltk.tokenize import word_tokenize
-from rouge_score import rouge_scorer
-import re
-import string
-from . import code_metrics_helper
-import nltk
 
 
-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")  # Required for rouge
+install_nltk_resources()
 
 
 def pass_at_k_estimator(n: int, c: int, k: int) -> float:

@@ -40,7 +39,7 @@ def pass_at_k_estimator(n: int, c: int, k: int) -> float:
     return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 
 
-def normalize_text(text: str) -> str:
+def normalize_text(text: str, should_remove_articles: bool = True) -> str:
     """Lower text and remove punctuation, articles and extra whitespace.
     Copied from the [QuAC](http://quac.ai/) evaluation script found at
     https://s3.amazonaws.com/my89public/quac/scorer.py"""

@@ -58,7 +57,10 @@ def normalize_text(text: str) -> str:
     def lower(text: str) -> str:
         return text.lower()
 
-    return white_space_fix(remove_articles(remove_punc(lower(text))))
+    normalized_text = remove_punc(lower(text))
+    if should_remove_articles:
+        normalized_text = remove_articles(normalized_text)
+    return white_space_fix(normalized_text)
 
 
 def exact_match(gold: str, pred: str) -> float:

@@ -75,6 +77,17 @@ def quasi_exact_match(gold: str, pred: str) -> float:
     return 1 if normalize_text(gold) == normalize_text(pred) else 0
 
 
+def quasi_leave_articles_exact_match(gold: str, pred: str) -> float:
+    if not pred:
+        return 0
+
+    return (
+        1
+        if normalize_text(gold, should_remove_articles=False) == normalize_text(pred, should_remove_articles=False)
+        else 0
+    )
+
+
 def prefix_exact_match(gold: str, pred: str) -> float:
     """
     The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is

@@ -203,6 +216,112 @@ def cider(gold: str, pred: str) -> float:
     return average_score
 
 
+def wer_score(gold: str, pred: str) -> float:
+    # Word Error Rate (WER), which is a common
+    # metric used to evaluate the accuracy of speech recognition systems.
+    # The lower the better. The WER might be greater than 1.
+    # https://huggingface.co/learn/audio-course/en/chapter5/evaluation#word-error-rate
+    try:
+        from jiwer import wer
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    wer_ret = wer(gold, pred)
+    return wer_ret
+
+
+def mer_score(gold: str, pred: str) -> float:
+    # Match Error Rate (MER), which is for evaluating the error rate of
+    # speech recognition systems. The lower the better.
+    try:
+        from jiwer import mer
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    mer_ret = mer(gold, pred)
+    return mer_ret
+
+
+def wip_score(gold: str, pred: str) -> float:
+    # Word information preservation (WIP) for evaluating the preserved information of speech
+    # recognition systems. The higher the better.
+    try:
+        from jiwer import wip
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    wip_ret = wip(gold, pred)
+    return wip_ret
+
+
+def cer_score(gold: str, pred: str) -> float:
+    # Character Error Rate (CER) for evaluating the accuracy
+    # of speech recognition systems. The lower the better.
+    try:
+        from jiwer import cer
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    cer_ret = cer(gold, pred)
+    assert isinstance(cer_ret, float)
+    return cer_ret
+
+
+def chinese_wer_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return wer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
+def chinese_mer_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return mer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
+def chinese_wip_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return wip_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
+def chinese_cer_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return cer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
 def extract_set_from_text(
     set_str: str,
     set_start_str: str = " is ",

@@ -329,6 +448,7 @@ def compute_reference_metrics(
     metric_fn_mapping: Dict[str, Callable] = {
         "exact_match": exact_match,
         "quasi_exact_match": quasi_exact_match,
+        "quasi_leave_articles_exact_match": quasi_leave_articles_exact_match,
         "prefix_exact_match": prefix_exact_match,
         "quasi_prefix_exact_match": quasi_prefix_exact_match,
         "exact_match_indicator": exact_match_indicator,

@@ -352,6 +472,14 @@ def compute_reference_metrics(
         "chinese_rouge_2": get_chinese_rouge_function("rouge2"),
         "cleva_math_result_match": cleva_math_result_match,
         "absolute_value_difference": absolute_value_difference,
+        "wer_score": wer_score,
+        "mer_score": mer_score,
+        "wip_score": wip_score,
+        "cer_score": cer_score,
+        "chinese_wer_score": chinese_wer_score,
+        "chinese_mer_score": chinese_mer_score,
+        "chinese_wip_score": chinese_wip_score,
+        "chinese_cer_score": chinese_cer_score,
     }
 
     stats: List[Stat] = []
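For context on the evaluate_reference_metrics.py hunks above: normalize_text gains a should_remove_articles flag, and the new quasi_leave_articles_exact_match uses it to keep articles when comparing. A minimal sketch of the difference, with made-up strings (not part of the diff):

from helm.benchmark.metrics.evaluate_reference_metrics import (
    quasi_exact_match,
    quasi_leave_articles_exact_match,
)

gold = "The Eiffel Tower"
pred = "Eiffel Tower"

# quasi_exact_match normalizes both sides with article removal,
# so "the eiffel tower" and "eiffel tower" compare equal.
print(quasi_exact_match(gold, pred))                 # expected: 1

# The new variant keeps articles, so the leading "The" now matters.
print(quasi_leave_articles_exact_match(gold, pred))  # expected: 0

The new wer_score, mer_score, wip_score, and cer_score entries reuse the same flag so that articles are preserved before the strings are handed to jiwer.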
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py (new file)

@@ -0,0 +1,103 @@
+import re
+from typing import List, Optional
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+def extract_answer(output_text: str) -> Optional[str]:
+    """
+    Extracts the answer from the output text using two exact regex patterns.
+    Returns None if no valid answer is found.
+
+    Args:
+        output_text (str): The text from which to extract the answer.
+
+    Returns:
+        Optional[str]: The extracted answer (A-J) if found, otherwise None.
+    """
+    # First regex: Matches "answer is (A-J)" with optional parentheses
+    match = re.search(r"answer is \(?([A-J])\)?", output_text)
+    if match:
+        return match.group(1)
+
+    # Second regex: Matches "[answer: (A-J)]" with optional leading characters like "."
+    match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text)
+    if match:
+        return match.group(1)
+
+    # Third regex: Matches "answer is (A-J)" with optional leading non-alpha characters
+    match = re.search(r"correct answer is [^A-Za-z]*([A-J])", output_text)
+    if match:
+        return match.group(1)
+
+    # Fourth regex: Matches "answer is (A-J)" with optional leading non-capital alpha characters
+    match = re.search(r"correct answer is [^A-Z]*([A-J])", output_text)
+    if match:
+        return match.group(1)
+
+    # If no regex matches, return None
+    return None
+
+
+class GPQAChainOfThoughtMetric(Metric):
+    """
+    This metric focuses on structured reasoning and the accuracy of extracted answers.
+    It compares model outputs against correct answers provided in a multiple-choice
+    format and returns a score indicating the correctness of the generated response.
+    """
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate the generated output for chain-of-thought reasoning accuracy.
+
+        The method extracts the model's output, determines the correct answer
+        from the provided references, and compares the two to compute a binary score.
+
+        Args:
+            adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation.
+            request_state (RequestState): The state of the current request, including
+                the input instance, output results, and references.
+            metric_service (MetricService): A service used to compute metrics if needed.
+            eval_cache_path (str): Path to the evaluation cache for storing or retrieving data.
+
+        Returns:
+            List[Stat]: A list containing a single `Stat` object with the correctness
+                score (1 for correct, 0 for incorrect) under the metric
+                name "chain_of_thought_correct".
+        """
+        # Assert that completions exist if the result is not None
+        assert (
+            request_state.result is not None and request_state.result.completions
+        ), "Request state result must have completions."
+
+        # Set output_text if the assertion passes
+        output_text = request_state.result.completions[0].text
+
+        # Extract the answer using the updated logic
+        extracted_answer = extract_answer(output_text)
+
+        # Find the correct answer from references by translating index to letter
+        correct_answer = None
+        for index, option in enumerate(request_state.instance.references):
+            if option.is_correct:
+                correct_answer = chr(65 + index)  # Translate index (0 -> A, 1 -> B, etc.)
+                break
+
+        # Raise an exception if no correct answer is found
+        if correct_answer is None:
+            raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}")
+
+        # Compare extracted answer with the correct answer and compute the score
+        score = 1 if extracted_answer == correct_answer else 0
+        return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
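A rough illustration of how extract_answer above falls through its regex patterns; the example strings are invented, and the import path assumes the module location shown in the file listing:

from helm.benchmark.metrics.gpqa_chain_of_thought_metric import extract_answer

# First pattern: "answer is" followed by an optionally parenthesized letter.
print(extract_answer("Working through the options, the answer is (C)."))  # expected: "C"

# Third pattern: "correct answer is" followed by non-letter characters before the letter.
print(extract_answer("Therefore, the correct answer is: B"))              # expected: "B"

# No pattern matches, so the metric treats the instance as incorrect.
print(extract_answer("I am not sure."))                                   # expected: None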
helm/benchmark/metrics/gpt4_audio_critique_metrics.py (new file)

@@ -0,0 +1,167 @@
+from typing import Dict, List
+import re
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from helm.benchmark.metrics.metric_name import MetricContext, MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
+from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
+from helm.common.hierarchical_logger import hlog
+from helm.common.request import RequestResult, Request, GeneratedOutput
+
+
+class GPT4AudioCritiqueMetric(MetricInterface):
+    """
+    Critique evaluation for evaluating how original the generated text are given the image by GPT4.
+    """
+
+    # We can add more evaluation aspects here
+    METRIC_NAME: str = "gpt4_audio_critique"
+    GPT4_AUDIO_CRITIQUE_PROMPT: str = """###Task Description:
+A ground truth answer, a response from a model to evaluate, and a score rubric \
+representing a evaluation criteria are given.
+1. Write a one-sentence feedback that assess the quality of the response \
+strictly based on the given score rubric, not evaluating in general.
+2. After writing the one-sentence feedback, write a score that is an \
+integer between 1 and 5. You should refer to the score rubric.
+3. Please do not generate any other opening, closing, and explanations.
+
+###The ground truth answer:
+{{ground_truth}}
+
+###Model Response to evaluate:
+{{orig_response}}
+
+###Score Rubrics:
+[Does the predicted response align with the ground truth \
+in terms of accuracy?]
+
+Score 1: The response is completely inaccurate or unrelated to the ground truth.
+Score 2: The response contains significant inaccuracies or misinterpretations \
+that distort the meaning of the ground truth.
+Score 3: The response is mostly accurate but includes minor errors, omissions, \
+or ambiguities.
+Score 4: The response is accurate and aligns well with the ground truth, with \
+only slight room for improvement.
+Score 5: The response is fully accurate and precisely matches the ground truth \
+with no errors or misinterpretations.
+
+Your response should be in the format:
+###Short Explanation: (explanation in only one sentence)
+###Rating: (int)
+"""
+
+    def __init__(self, num_respondents: int, max_tokens: int):
+        self._num_respondents = num_respondents
+        self._max_tokens = max_tokens
+
+    def __repr__(self) -> str:
+        return "GPT4AudioCritiqueMetric()"
+
+    def _extract_score_from_gpt_output(self, evaluator_response: str):
+        """
+        Extract the score from the evaluator response. Refer to the official Vibe-Eval implementation:
+        https://github.com/reka-ai/reka-vibe-eval/blob/3852d4712da172a7b85dddeffc4f9c3482a6f4c9/evaluate.py#L159-#L164
+        """
+        re_match = re.search(r"Rating:\s*([1-5])", evaluator_response)
+        if re_match is None:
+            hlog(f"Error parsing answer: {evaluator_response}. Skipping question (and so the respondent entirely)")
+            return None
+        return int(re_match.group(1))
+
+    def evaluate(
+        self,
+        scenario_state: ScenarioState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+        parallelism: int,
+    ) -> MetricResult:
+        request_states: List[RequestState] = scenario_state.request_states
+
+        all_stats: Dict[MetricName, Stat] = {}
+        per_instance_stats: List[PerInstanceStats] = []
+        for request_state in request_states:
+            context = MetricContext.from_instance(request_state.instance)
+            stats_without_context = self.evaluate_generation(
+                scenario_state.adapter_spec,
+                request_state,
+                metric_service,
+                eval_cache_path,
+            )
+            stats = [add_context(stat_without_context, context) for stat_without_context in stats_without_context]
+            for stat in stats:
+                merge_stat(all_stats, stat)
+            assert request_state.instance.id is not None
+            per_instance_stats.append(
+                PerInstanceStats(
+                    instance_id=request_state.instance.id,
+                    perturbation=request_state.instance.perturbation,
+                    train_trial_index=request_state.train_trial_index,
+                    stats=stats,
+                )
+            )
+        return MetricResult(aggregated_stats=list(all_stats.values()), per_instance_stats=per_instance_stats)
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        input_request: Request = request_state.request
+        # Predicted outputs and their originality scores
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        # Get input image and generated response for the originality evaluation
+        assert input_request.multimodal_prompt is not None
+        completions: List[GeneratedOutput] = request_result.completions
+        generated_text: str = completions[0].text
+        ref_text: str = request_state.instance.references[0].output.text
+
+        template = CritiqueTaskTemplate(
+            name="ahelm_gpt4_audio_critique",
+            instructions=self.GPT4_AUDIO_CRITIQUE_PROMPT,
+            num_respondents=self._num_respondents,
+            max_tokens=self._max_tokens,
+            questions=[
+                CritiqueQuestionTemplate(
+                    name=self.METRIC_NAME,
+                    question_type=QuestionType.FREE_RESPONSE,
+                    text="",
+                    options=[],
+                    # media_object=image_objects[0],  # we only take the first image as input
+                )
+            ],
+        )
+
+        request = CritiqueRequest(
+            template=template,
+            fields={
+                "ground_truth": ref_text,
+                "orig_response": generated_text,
+            },
+        )
+
+        # send to critique request
+        result = metric_service.make_critique_request(request)
+        if not result or not result.responses:
+            # Skip computing metrics if there aren't any responses yet
+            hlog("Waiting for responses to be generated.")
+            return []
+
+        stats: Dict[str, Stat] = {}
+        for question in template.questions:
+            stats[question.name] = Stat(MetricName(question.name))
+
+        for response in result.responses:
+            for answer_name, answer in response.answers.items():
+                assert isinstance(answer, str)
+                answer_value: float
+                answer_value = self._extract_score_from_gpt_output(answer)
+                stats[answer_name].add(answer_value)
+
+        return list(stats.values())
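For reference, the score parsing in _extract_score_from_gpt_output reduces to a single regex over the judge's reply; a standalone sketch with an invented evaluator response:

import re

def extract_rating(evaluator_response: str):
    # Same pattern as the method above: an integer 1-5 following "Rating:".
    re_match = re.search(r"Rating:\s*([1-5])", evaluator_response)
    return int(re_match.group(1)) if re_match else None

reply = "###Short Explanation: The response matches the ground truth.\n###Rating: 5"
print(extract_rating(reply))  # expected: 5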
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py (new file)

@@ -0,0 +1,36 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class HelpdeskCallSummarizationMetric(Metric):
+    """Score metrics for helpdesk call summarization."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["helpdesk_call_center_summarization"]
+        scores: List[int] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_score") and annotation_value is not None:
+                scores.append(annotation_value)
+        if not scores:
+            raise ValueError(
+                "Could not compute score in HelpdeskCallSummarizationMetric because all annotators failed."
+            )
+        score = sum(scores) / len(scores)
+        # normalize score
+        score = (score - 1) / 9
+        return [
+            Stat(MetricName("call_summarization_score")).add(score),
+        ]
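The final step (score - 1) / 9 maps the averaged annotator score, evidently on a 1-10 scale, onto [0, 1]. A small worked example with hypothetical "_score" annotation values:

scores = [7, 9]                    # hypothetical per-annotator scores
score = sum(scores) / len(scores)  # 8.0
score = (score - 1) / 9            # (8.0 - 1) / 9 = 0.777...
print(round(score, 3))             # expected: 0.778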