eval-framework 0.2.12__tar.gz → 0.2.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.12 → eval_framework-0.2.13}/PKG-INFO +2 -2
- {eval_framework-0.2.12 → eval_framework-0.2.13}/README.md +1 -1
- {eval_framework-0.2.12 → eval_framework-0.2.13}/pyproject.toml +3 -1
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/evaluation_generator.py +32 -6
- eval_framework-0.2.13/src/eval_framework/external/drop_process_results.py +250 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/huggingface.py +23 -2
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/vllm.py +4 -3
- eval_framework-0.2.13/src/eval_framework/metrics/completion/drop_completion.py +47 -0
- eval_framework-0.2.13/src/eval_framework/metrics/completion/math_minerva_completion.py +103 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/math_reasoning_completion.py +2 -8
- eval_framework-0.2.13/src/eval_framework/metrics/completion/minerva_math_utils.py +394 -0
- eval_framework-0.2.13/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +64 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/base.py +4 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/arc.py +28 -2
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/balancedcopa.py +56 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/copa.py +54 -6
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/csqa.py +90 -0
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/drop.py +207 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/flores200.py +23 -33
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/global_mmlu.py +532 -0
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/goldenswag.py +42 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/gpqa.py +27 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/hellaswag.py +2 -1
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/humaneval.py +44 -0
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/lab_bench.py +89 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/math_reasoning.py +130 -9
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/mbpp.py +22 -0
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/medqa.py +83 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/mmlu.py +15 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +13 -0
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/naturalqs_open.py +100 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/openbookqa.py +45 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/piqa.py +29 -1
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/sciq.py +50 -8
- eval_framework-0.2.13/src/eval_framework/tasks/benchmarks/social_iqa.py +231 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/squad.py +26 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/truthfulqa.py +31 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/winogrande.py +29 -1
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +2 -2
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/task_names.py +48 -1
- {eval_framework-0.2.12 → eval_framework-0.2.13}/LICENSE +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/run.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.13}/src/template_formatting/py.typed +0 -0
{eval_framework-0.2.12 → eval_framework-0.2.13}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.12
+Version: 0.2.13
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License
@@ -377,7 +377,7 @@ Subset of core capabilities benchmarks coverd by `eval-framework`:
 
 | **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
 |---------------|---------------|----------|------------|------------------------|------------------|
-| COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
+| COPA, BalancedCOPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
 | Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
 | Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
 
```
{eval_framework-0.2.12 → eval_framework-0.2.13}/README.md

```diff
@@ -106,7 +106,7 @@ Subset of core capabilities benchmarks coverd by `eval-framework`:
 
 | **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
 |---------------|---------------|----------|------------|------------------------|------------------|
-| COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
+| COPA, BalancedCOPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
 | Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
 | Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
 
```
{eval_framework-0.2.12 → eval_framework-0.2.13}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.12"
+version = "0.2.13"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -105,6 +105,7 @@ dev = [
     "types-requests>=2.32.0.20250328,<3",
     "plotly>=5.24.1,<6",
     "ruff>=0.12.8",
+    "scipy>=1.14.0,<2",  # for tests comparing our Hungarian implementation to scipy
 ]
 flash-attn = [
     "flash-attn>=2.7.2.post1,<2.8",
@@ -178,6 +179,7 @@ addopts = "-p 'no:legacypath' --doctest-modules"
 markers = [
     "gpu: needs a GPU runner, otherwise test can not be run",
     "cpu_slow: runs for a long time (on CPU)",
+    "slow_download: smoke tests that download large datasets (>15s); excluded from CI, run manually with -m slow_download",
     "external_api: needs external services for execution",
     "vllm: tests that specifically require vLLM functionality",
     "formatter_hash: formatter consistency tests using hash comparisons",
```
{eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/evaluation_generator.py

```diff
@@ -134,16 +134,27 @@ class EvaluationGenerator:
         # filter and count errors
         total_count = len(data_subset)
 
-        mask =
+        mask = data_subset["error"].isnull()
         data_subset_error_free = data_subset.loc[mask, ["subject", "key", "value"]]
-        # data_subset_error_free = data_subset[data_subset["error"].isnull()][["subject", "key", "value"]]
 
-
+        error_free_ratio = float(len(data_subset_error_free) / total_count)
+        aggregated_results[f"ErrorFreeRatio {metric}"] = error_free_ratio
 
         # aggregate by key and subject first to have equal weights for all key / subject combinations
         key_subject_mean = data_subset_error_free.groupby(["key", "subject"]).mean()
         aggregated_results[f"Average {metric}"] = float(key_subject_mean[["value"]].mean()["value"])
 
+        if error_free_ratio < 1.0:
+            # Treat error samples (with value=None) as 0 for the "including errors" average
+            data_subset_with_errors = data_subset[["key", "subject", "value", "error"]].copy()
+            # Only fill value with 0 where there's an error (not for all None values)
+            error_mask = data_subset_with_errors["error"].notna()
+            data_subset_with_errors.loc[error_mask, "value"] = data_subset_with_errors.loc[
+                error_mask, "value"
+            ].fillna(0.0)
+            key_subject_mean_with_errors = data_subset_with_errors.groupby(["key", "subject"])["value"].mean()
+            aggregated_results[f"Average {metric} (including Errors)"] = float(key_subject_mean_with_errors.mean())
+
         std_err_mean_sum_of_squares = 0.0
         std_err_mean_total_num_samples = 0.0
         std_err_mean_num_subjects = 0
@@ -156,14 +167,29 @@ class EvaluationGenerator:
             # group = data_subset[data[column] == name][["subject", "key", "value", "error"]]
             group_total_count = len(group)
             group_error_free = group[group["error"].isnull()][["subject", "key", "value"]]
-
-
-            )
+            group_error_free_ratio = float(len(group_error_free) / group_total_count)
+            aggregated_results[f"ErrorFreeRatio {metric} - {name[0]}"] = group_error_free_ratio
 
             group_key_subject_mean = group_error_free.groupby(["key", "subject"]).mean()
             value = float(group_key_subject_mean[["value"]].mean()["value"])
             aggregated_results[f"Average {metric} - {name[0]}"] = value if not math.isnan(value) else None
 
+            if group_error_free_ratio < 1.0:
+                # Treat error samples (with value=None) as 0 for the "including errors" average
+                group_with_errors = group[["key", "subject", "value", "error"]].copy()
+                # Only fill value with 0 where there's an error (not for all None values)
+                error_mask = group_with_errors["error"].notna()
+                group_with_errors.loc[error_mask, "value"] = group_with_errors.loc[
+                    error_mask, "value"
+                ].fillna(0.0)
+                group_key_subject_mean_with_errors = group_with_errors.groupby(["key", "subject"])[
+                    "value"
+                ].mean()
+                value_with_errors = float(group_key_subject_mean_with_errors.mean())
+                aggregated_results[f"Average {metric} (including Errors) - {name[0]}"] = (
+                    value_with_errors if not math.isnan(value_with_errors) else None
+                )
+
             if not ("SequencePositions" in metric or "Bytes" in metric):
                 # calculate standard error for selected metrics
                 group_key_subject_std = group_error_free.groupby(["key", "subject"]).std()
```
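The aggregation change above adds an `ErrorFreeRatio <metric>` entry and, whenever some samples errored, an `Average <metric> (including Errors)` entry that counts those samples as 0. A standalone sketch with made-up toy rows (not framework data) of how those numbers come out:

```python
import pandas as pd

# Hypothetical per-sample results: one sample errored, so its value is missing.
data_subset = pd.DataFrame(
    {
        "key": ["q1", "q1", "q2", "q2"],
        "subject": ["math", "math", "bio", "bio"],
        "value": [1.0, None, 0.0, 1.0],
        "error": [None, "timeout", None, None],
    }
)

error_free = data_subset[data_subset["error"].isnull()][["key", "subject", "value"]]
error_free_ratio = len(error_free) / len(data_subset)  # 0.75

# Errored samples count as 0 only for the "including Errors" average.
with_errors = data_subset[["key", "subject", "value", "error"]].copy()
error_mask = with_errors["error"].notna()
with_errors.loc[error_mask, "value"] = with_errors.loc[error_mask, "value"].fillna(0.0)

average = error_free.groupby(["key", "subject"])["value"].mean().mean()               # (1.0 + 0.5) / 2 = 0.75
average_incl_errors = with_errors.groupby(["key", "subject"])["value"].mean().mean()  # (0.5 + 0.5) / 2 = 0.5
```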
eval_framework-0.2.13/src/eval_framework/external/drop_process_results.py (new file)

```diff
@@ -0,0 +1,250 @@
+"""DROP F1 and exact match evaluation.
+
+Logic adapted from AllenNLP DROP evaluation:
+https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
+"""
+
+import re
+import string
+
+import numpy as np
+
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+
+
+def _linear_sum_assignment(cost_matrix: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """Solve the linear sum assignment problem (minimize cost) using the Hungarian algorithm.
+
+    Pure NumPy implementation to avoid scipy dependency. Returns (row_ind, col_ind) with row_ind sorted,
+    matching scipy.optimize.linear_sum_assignment API for rectangular matrices.
+    """
+    cost = np.atleast_2d(np.asarray(cost_matrix, dtype=np.float64))
+    n_rows, n_cols = cost.shape
+    n = max(n_rows, n_cols)
+    # Pad to square with zeros so unassigned rows/cols contribute zero cost
+    C = np.zeros((n, n))
+    C[:n_rows, :n_cols] = cost
+
+    # Row and column reductions
+    u = np.min(C, axis=1)
+    u[u == np.inf] = 0
+    C = C - u[:, np.newaxis]
+    v = np.min(C, axis=0)
+    v[v == np.inf] = 0
+    C = C - v[np.newaxis, :]
+
+    # Starred zeros: assignment (1 = starred). Start with no stars.
+    star = np.zeros((n, n), dtype=np.intp)
+    row_covered = np.zeros(n, dtype=bool)
+    col_covered = np.zeros(n, dtype=bool)
+    prime = np.zeros((n, n), dtype=np.intp)
+
+    def find_zero() -> tuple[int, int] | None:
+        for i in range(n):
+            if row_covered[i]:
+                continue
+            for j in range(n):
+                if not col_covered[j] and C[i, j] == 0:
+                    return (i, j)
+        return None
+
+    def star_in_row(i: int) -> int | None:
+        for j in range(n):
+            if star[i, j]:
+                return j
+        return None
+
+    def star_in_col(j: int) -> int | None:
+        for i in range(n):
+            if star[i, j]:
+                return i
+        return None
+
+    def prime_in_row(i: int) -> int | None:
+        for j in range(n):
+            if prime[i, j]:
+                return j
+        return None
+
+    while True:
+        # Cover columns containing a starred zero
+        col_covered[:] = False
+        for j in range(n):
+            for i in range(n):
+                if star[i, j]:
+                    col_covered[j] = True
+                    break
+        if np.all(col_covered):
+            break
+        prime[:] = 0
+        row_covered[:] = False
+        while True:
+            z = find_zero()
+            if z is None:
+                # No uncovered zero: find minimum uncovered value and adjust
+                min_val = np.inf
+                for i in range(n):
+                    if not row_covered[i]:
+                        for j in range(n):
+                            if not col_covered[j] and C[i, j] < min_val:
+                                min_val = C[i, j]
+                if min_val == np.inf or min_val <= 0:
+                    min_val = 1e-10
+                for i in range(n):
+                    if row_covered[i]:
+                        C[i, :] += min_val
+                for j in range(n):
+                    if not col_covered[j]:
+                        C[:, j] -= min_val
+                continue
+            i, j = z
+            prime[i, j] = 1
+            cj = star_in_row(i)
+            if cj is None:
+                # Augmenting path: unstar starred, star primed along path
+                path = [(i, j)]
+                while True:
+                    ji = star_in_col(path[-1][1])
+                    if ji is None:
+                        break
+                    path.append((ji, path[-1][1]))
+                    pj = prime_in_row(ji)
+                    if pj is None:
+                        break
+                    path.append((ji, pj))
+                for pi, pj in path:
+                    star[pi, pj] = 1 - star[pi, pj]
+                prime[:] = 0
+                row_covered[:] = False
+                col_covered[:] = False
+                break
+            row_covered[i] = True
+            col_covered[cj] = False
+
+    # Extract assignment: (row_ind, col_ind) for starred zeros, row_ind sorted
+    row_ind = np.array([i for i in range(n) for j in range(n) if star[i, j]], dtype=np.intp)
+    col_ind = np.array([j for i in range(n) for j in range(n) if star[i, j]], dtype=np.intp)
+    # Keep only assignments within original matrix
+    mask = (row_ind < n_rows) & (col_ind < n_cols)
+    row_ind = row_ind[mask]
+    col_ind = col_ind[mask]
+    # Sort by row index (scipy API)
+    perm = np.argsort(row_ind)
+    return row_ind[perm], col_ind[perm]
+
+
+def process_results(doc: dict, results: list) -> dict[str, float]:
+    """Compute DROP exact_match and F1 between predictions and gold answers.
+
+    doc["answers"] is a list of gold answers (each a tuple or list of strings).
+    results is a list of predicted answers (one per prediction; for one completion use [pred]).
+    """
+    preds, golds = results, doc["answers"]
+    max_em = 0.0
+    max_f1 = 0.0
+    for gold_answer in golds:
+        exact_match, f1_score = get_metrics(preds, gold_answer)
+        if gold_answer and (gold_answer[0].strip() if isinstance(gold_answer[0], str) else True):
+            max_em = max(max_em, exact_match)
+            max_f1 = max(max_f1, f1_score)
+    return {"exact_match": max_em, "f1": max_f1}
+
+
+def get_metrics(predicted: list | str, gold: tuple | list) -> tuple[float, float]:
+    """Return (exact_match, f1) for one gold answer. predicted and gold can be string or list of strings."""
+    predicted_bags = _answer_to_bags(predicted)
+    gold_bags = _answer_to_bags(gold)
+
+    if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
+        exact_match = 1.0
+    else:
+        exact_match = 0.0
+
+    f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
+    f1 = float(np.mean(f1_per_bag))
+    f1 = round(f1, 2)
+    return exact_match, f1
+
+
+def _answer_to_bags(answer: list | tuple | str) -> tuple[list[str], list[set]]:
+    if isinstance(answer, list | tuple):
+        raw_spans = list(answer)
+    else:
+        raw_spans = [answer]
+    normalized_spans = []
+    token_bags = []
+    for raw_span in raw_spans:
+        normalized_span = _normalize(str(raw_span))
+        normalized_spans.append(normalized_span)
+        token_bags.append(set(normalized_span.split()))
+    return normalized_spans, token_bags
+
+
+def _align_bags(predicted: list[set], gold: list[set]) -> np.ndarray:
+    scores = np.zeros([len(gold), len(predicted)])
+    for gold_index, gold_item in enumerate(gold):
+        for pred_index, pred_item in enumerate(predicted):
+            if _match_numbers_if_present(gold_item, pred_item):
+                scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
+    row_ind, col_ind = _linear_sum_assignment(-scores)
+
+    max_scores = np.zeros([max(len(gold), len(predicted))])
+    for row, column in zip(row_ind, col_ind):
+        max_scores[row] = max(max_scores[row], scores[row, column])
+    return max_scores
+
+
+def _compute_f1(predicted_bag: set, gold_bag: set) -> float:
+    intersection = len(gold_bag.intersection(predicted_bag))
+    if not predicted_bag:
+        precision = 1.0
+    else:
+        precision = intersection / float(len(predicted_bag))
+    if not gold_bag:
+        recall = 1.0
+    else:
+        recall = intersection / float(len(gold_bag))
+    return (2 * precision * recall) / (precision + recall) if (precision or recall) else 0.0
+
+
+def _match_numbers_if_present(gold_bag: set, predicted_bag: set) -> bool:
+    gold_numbers = {w for w in gold_bag if _is_number(w)}
+    predicted_numbers = {w for w in predicted_bag if _is_number(w)}
+    return (not gold_numbers) or bool(gold_numbers.intersection(predicted_numbers))
+
+
+def _is_number(text: str) -> bool:
+    try:
+        float(text)
+        return True
+    except ValueError:
+        return False
+
+
+def _remove_articles(text: str) -> str:
+    return _ARTICLES.sub(" ", text)
+
+
+def _white_space_fix(text: str) -> str:
+    return " ".join(text.split())
+
+
+def _remove_punc(text: str) -> str:
+    exclude = set(string.punctuation)
+    return "".join(ch for ch in text if ch not in exclude) if not _is_number(text) else text
+
+
+def _fix_number(text: str) -> str:
+    return str(float(text)) if _is_number(text) else text
+
+
+def _tokenize(text: str) -> list[str]:
+    return re.split(" |-", text)
+
+
+def _normalize(answer: str) -> str:
+    tokens = [
+        _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower())))) for token in _tokenize(answer)
+    ]
+    tokens = [t for t in tokens if t.strip()]
+    return " ".join(tokens).strip()
```
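Since the new module depends only on NumPy, its helpers can be exercised directly. Below is a hedged sketch (made-up inputs) of two checks: `get_metrics` on a single-span answer, and the pure-NumPy Hungarian solver compared against `scipy.optimize.linear_sum_assignment`, which is what the new scipy dev dependency in pyproject.toml is there to support. The assertion assumes the in-package solver is correct, which those tests are meant to verify.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment  # dev-only dependency added in pyproject.toml above

from eval_framework.external.drop_process_results import _linear_sum_assignment, get_metrics

# Normalization lowercases, canonicalizes numbers ("12" -> "12.0") and drops articles/punctuation,
# so an exact textual match scores 1.0 on both exact match and F1.
exact_match, f1 = get_metrics("12 yards", ("12 yards",))
print(exact_match, f1)  # 1.0 1.0

# The in-house Hungarian solver should reach the same optimal total cost as scipy's reference.
cost = np.array([[4.0, 1.0, 3.0], [2.0, 0.0, 5.0], [3.0, 2.0, 2.0]])
rows, cols = _linear_sum_assignment(cost)
ref_rows, ref_cols = linear_sum_assignment(cost)
assert np.isclose(cost[rows, cols].sum(), cost[ref_rows, ref_cols].sum())
```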
{eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/huggingface.py

```diff
@@ -10,7 +10,14 @@ from typing import Any
 
 import torch
 from tokenizers import Tokenizer
-from transformers import
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+)
+from transformers.models.gpt2 import GPT2Tokenizer
+from transformers.tokenization_utils import PreTrainedTokenizerBase
 
 from eval_framework.llm.base import BaseLLM
 from eval_framework.shared.types import (
@@ -83,9 +90,13 @@ class BaseHFLLM(BaseLLM):
     SEQ_LENGTH: int | None = None
     BYTES_PER_TOKEN: float = 4.0  # rule of thumb according to https://platform.openai.com/tokenizer
 
+    def _load_tokenizer(self) -> PreTrainedTokenizerBase:
+        """Load the tokenizer. Override in subclasses to use a specific tokenizer class."""
+        return AutoTokenizer.from_pretrained(self.LLM_NAME)
+
     def __init__(self, formatter: BaseFormatter | None = None, bytes_per_token: float | None = None) -> None:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.tokenizer =
+        self.tokenizer = self._load_tokenizer()
         self.model = AutoModelForCausalLM.from_pretrained(self.LLM_NAME, device_map="auto")
         logger.info(f"{RED}[ Model initialized --------------------- {RESET}{self.LLM_NAME} {RED}]{RESET}")
         self._set_formatter(formatter)
@@ -403,14 +414,24 @@ class Pythia410m(HFLLM):
 
 
 class SmolLM135M(HFLLM):
+    """SmolLM-135M uses a GPT2-style tokenizer; AutoTokenizer can incorrectly select LlamaTokenizer."""
+
     LLM_NAME = "HuggingFaceTB/SmolLM-135M"
     DEFAULT_FORMATTER = ConcatFormatter
 
+    def _load_tokenizer(self) -> PreTrainedTokenizerBase:
+        return GPT2Tokenizer.from_pretrained(self.LLM_NAME)
+
 
 class Smollm135MInstruct(HFLLM):
+    """SmolLM-135M-Instruct uses a GPT2-style tokenizer; AutoTokenizer can incorrectly select LlamaTokenizer."""
+
     LLM_NAME = "HuggingFaceTB/SmolLM-135M-Instruct"
     DEFAULT_FORMATTER = partial(HFFormatter, LLM_NAME)
 
+    def _load_tokenizer(self) -> PreTrainedTokenizerBase:
+        return GPT2Tokenizer.from_pretrained(self.LLM_NAME)
+
 
 class Qwen3_0_6B(HFLLM):
     LLM_NAME = "Qwen/Qwen3-0.6B"
```
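The new `_load_tokenizer()` hook is the extension point: model wrappers that need a specific tokenizer class override it instead of patching `__init__`. A hedged sketch with a hypothetical wrapper class and placeholder checkpoint name (not part of the package), mirroring the SmolLM classes above:

```python
from transformers import GPT2Tokenizer
from transformers.tokenization_utils import PreTrainedTokenizerBase

from eval_framework.llm.huggingface import HFLLM


class MyGPT2StyleModel(HFLLM):  # hypothetical example class
    LLM_NAME = "example-org/example-gpt2-style-checkpoint"  # placeholder

    def _load_tokenizer(self) -> PreTrainedTokenizerBase:
        # Pin the tokenizer class instead of relying on AutoTokenizer auto-detection.
        return GPT2Tokenizer.from_pretrained(self.LLM_NAME)
```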
{eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/llm/vllm.py

```diff
@@ -134,11 +134,12 @@ class BaseVLLMModel(BaseLLM):
             **kwargs,
         }
 
-        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-
         self.batch_size = batch_size
 
-
+        if "VLLM_TARGET_DEVICE" not in os.environ and not torch.cuda.is_available():
+            os.environ["VLLM_TARGET_DEVICE"] = "cpu"
+
+        self.model = LLM(**model_args)
 
         self._tokenizer: None | VLLMTokenizerAPI = None
         _ = self.tokenizer  # make sure tokenizer is initialized
```
eval_framework-0.2.13/src/eval_framework/metrics/completion/drop_completion.py (new file)

```diff
@@ -0,0 +1,47 @@
+"""DROP completion metrics: F1 and exact match."""
+
+from eval_framework.external.drop_process_results import process_results
+from eval_framework.metrics.base import BaseMetric, MetricResult
+from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
+
+
+class DropMetricContext(BaseMetricContext):
+    """Context for DROP completion metrics. answer_tuples: list of gold answers (each a list of strings)."""
+
+    answer_tuples: list[list[str]]
+
+
+class DropF1ExactMatch(BaseMetric[Completion]):
+    """DROP F1 and exact match. Requires DropMetricContext with answer_tuples."""
+
+    NAME = "DROP F1 / Exact Match"
+    KEYS = ["f1", "exact_match"]
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error is not None:
+            return [
+                MetricResult(metric_name=f"{self.NAME}/f1", value=None, higher_is_better=True, error=response.error),
+                MetricResult(
+                    metric_name=f"{self.NAME}/exact_match", value=None, higher_is_better=True, error=response.error
+                ),
+            ]
+
+        context = extract_context_metric(response, DropMetricContext)
+        # Gold: list of tuples (stored as list of lists)
+        answer_tuples = [list(a) for a in context.answer_tuples]
+        # Parse completion: comma-separated spans or single string
+        raw = (response.completion or "").strip()
+        pred_spans = [s.strip() for s in raw.split(",") if s.strip()] if raw else []
+        if not pred_spans:
+            pred_spans = [raw]
+
+        doc = {"answers": answer_tuples}
+        results = [pred_spans]
+        out = process_results(doc, results)
+
+        return [
+            MetricResult(metric_name="DROP F1", value=out["f1"], higher_is_better=True, error=response.error),
+            MetricResult(
+                metric_name="Exact Match", value=out["exact_match"], higher_is_better=True, error=response.error
+            ),
+        ]
```
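For intuition, the completion parsing inside `calculate` can be reproduced standalone with made-up strings (no framework `Completion` object involved):

```python
from eval_framework.external.drop_process_results import process_results

raw = "Denver Broncos, 24 points"  # hypothetical model completion
pred_spans = [s.strip() for s in raw.split(",") if s.strip()] or [raw]  # ["Denver Broncos", "24 points"]

doc = {"answers": [["Denver Broncos", "24 points"]]}  # one gold answer tuple
scores = process_results(doc, [pred_spans])  # dict with "exact_match" and "f1" keys
```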
eval_framework-0.2.13/src/eval_framework/metrics/completion/math_minerva_completion.py (new file)

```diff
@@ -0,0 +1,103 @@
+"""
+Minerva-style MATH completion metric: exact_match and exact_match_flex.
+"""
+
+from eval_framework.metrics.base import BaseMetric, MetricResult
+from eval_framework.metrics.completion.minerva_math_utils import (
+    extract_answers,
+    is_equiv_hendrycks,
+    is_equiv_minerva,
+)
+from eval_framework.shared.types import Completion
+
+
+class MathMinervaCompletion(BaseMetric[Completion]):
+    """
+    Minerva MATH: reports Exact Match and Exact Match (Flex).
+    Uses raw_completion to extract multiple candidates; primary for exact_match,
+    all candidates with both Minerva and Hendrycks equivalence for exact_match_flex.
+    """
+
+    NAME = "Math Minerva Completion"
+
+    def __init__(
+        self,
+        use_cot: bool = True,
+        cot_style: str = "minerva",
+        relaxed: bool = False,
+    ) -> None:
+        self.use_cot = use_cot
+        self.cot_style = cot_style
+        self.relaxed = relaxed
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error:
+            return [
+                MetricResult(
+                    metric_name="Exact Match",
+                    value=None,
+                    higher_is_better=True,
+                    error=response.error,
+                ),
+                MetricResult(
+                    metric_name="Exact Match (Flex)",
+                    value=None,
+                    higher_is_better=True,
+                    error=response.error,
+                ),
+            ]
+
+        gold = response.ground_truth
+        if isinstance(gold, list):
+            gold = gold[0] if gold else None
+        if not gold:
+            return [
+                MetricResult(
+                    metric_name="Exact Match",
+                    value=None,
+                    higher_is_better=True,
+                    error="No ground truth available",
+                ),
+                MetricResult(
+                    metric_name="Exact Match (Flex)",
+                    value=None,
+                    higher_is_better=True,
+                    error="No ground truth available",
+                ),
+            ]
+
+        raw = response.raw_completion or response.completion
+        all_candidates = extract_answers(raw, use_cot=self.use_cot, cot_style=self.cot_style, relaxed=self.relaxed)
+
+        exact_match = 0.0
+        if all_candidates:
+            primary = all_candidates[0]
+            if is_equiv_minerva(primary, gold):
+                exact_match = 1.0
+
+        exact_match_flex = float(
+            any(
+                is_equiv_minerva(candidate, gold) or is_equiv_hendrycks(candidate, gold) for candidate in all_candidates
+            )
+        )
+
+        return [
+            MetricResult(metric_name="Exact Match", value=exact_match, higher_is_better=True),
+            MetricResult(
+                metric_name="Exact Match (Flex)",
+                value=exact_match_flex,
+                higher_is_better=True,
+            ),
+        ]
+
+
+class MathMinervaCompletionRelaxed(MathMinervaCompletion):
+    """MathMinervaCompletion with relaxed=True by default (flexible final-answer matching)."""
+
+    def __init__(
+        self,
+        use_cot: bool = True,
+        cot_style: str = "minerva",
+        relaxed: bool = True,
+    ) -> None:
+        super().__init__(use_cot=use_cot, cot_style=cot_style, relaxed=relaxed)
```
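A hedged sketch of the flex-match logic with a hypothetical raw completion; the actual extraction and equivalence rules live in the new `minerva_math_utils.py` (not shown in this excerpt), so no concrete scores are asserted here:

```python
from eval_framework.metrics.completion.minerva_math_utils import (
    extract_answers,
    is_equiv_hendrycks,
    is_equiv_minerva,
)

raw = r"... so the probability is $\boxed{\frac{1}{2}}$."  # hypothetical model output
gold = r"\frac{1}{2}"

candidates = extract_answers(raw, use_cot=True, cot_style="minerva", relaxed=False)
exact_match = float(bool(candidates) and is_equiv_minerva(candidates[0], gold))
exact_match_flex = float(any(is_equiv_minerva(c, gold) or is_equiv_hendrycks(c, gold) for c in candidates))
```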
{eval_framework-0.2.12 → eval_framework-0.2.13}/src/eval_framework/metrics/completion/math_reasoning_completion.py

```diff
@@ -8,6 +8,7 @@ from sympy.parsing.latex import parse_latex
 from sympy.parsing.latex.errors import LaTeXParsingError
 
 from eval_framework.metrics.base import BaseMetric, MetricResult
+from eval_framework.metrics.completion.minerva_math_utils import _normalize_latex_core
 from eval_framework.shared.types import Completion
 
 
@@ -110,14 +111,7 @@ class MathReasoningCompletion(BaseMetric[Completion]):
         for expr in self.REMOVED_EXPRESSIONS_FORMAT:
             # Safely remove formatting expressions
            final_answer = final_answer.replace(expr, "")
-        final_answer =
-        final_answer = re.sub(r"(\\text\{)(.*?)(\})", r"\2", final_answer)
-        final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", r"\2", final_answer)
-        final_answer = re.sub(r"(\\overline\{)(.*?)(\})", r"\2", final_answer)
-        final_answer = re.sub(r"(\\boxed\{)(.*)(\})", r"\2", final_answer)
-        final_answer = re.sub(r"(frac)([^{])(.)", r"frac{\2}{\3}", final_answer)
-        final_answer = re.sub(r"(sqrt)([^{])", r"sqrt{\2}", final_answer)
-        final_answer = final_answer.replace("$", "")
+        final_answer = _normalize_latex_core(final_answer)
         # Only strip commas if it's a single numeric value with optional commas (like "1,000")
         if re.fullmatch(r"\d{1,3}(,\d{3})*", final_answer):
             final_answer = final_answer.replace(",", "")
```