eval-framework 0.2.14__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. {eval_framework-0.2.14 → eval_framework-0.3.1}/PKG-INFO +3 -2
  2. {eval_framework-0.2.14 → eval_framework-0.3.1}/pyproject.toml +4 -3
  3. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/determined.py +1 -0
  4. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/eval.py +2 -0
  5. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/local.py +1 -0
  6. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/evaluation_generator.py +68 -4
  7. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/main.py +1 -1
  8. eval_framework-0.3.1/src/eval_framework/metrics/aggregators/aggregators.py +139 -0
  9. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/base.py +5 -0
  10. eval_framework-0.3.1/src/eval_framework/metrics/completion/accuracy_completion.py +116 -0
  11. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_assertion.py +13 -1
  12. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +60 -7
  13. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/drop_completion.py +18 -9
  14. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/f1.py +41 -2
  15. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +2 -0
  16. eval_framework-0.3.1/src/eval_framework/metrics/completion/multipl_e_assertion.py +206 -0
  17. eval_framework-0.3.1/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +98 -0
  18. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +13 -3
  19. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/response_generator.py +24 -9
  20. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/run.py +33 -1
  21. eval_framework-0.3.1/src/eval_framework/suite.py +387 -0
  22. eval_framework-0.3.1/src/eval_framework/tasks/Dockerfile_codebench +9 -0
  23. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/base.py +39 -7
  24. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_de.py +2 -1
  25. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +77 -1
  26. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/csqa.py +8 -1
  27. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/drop.py +75 -13
  28. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +9 -2
  29. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +68 -1
  30. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +5 -0
  31. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +2 -1
  32. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +53 -0
  33. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/medqa.py +6 -0
  34. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu.py +1 -1
  35. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +2 -1
  36. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +7 -1
  37. eval_framework-0.3.1/src/eval_framework/tasks/benchmarks/multipl_e.py +234 -0
  38. eval_framework-0.3.1/src/eval_framework/tasks/benchmarks/naturalqs_open.py +103 -0
  39. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/piqa.py +3 -1
  40. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sciq.py +2 -0
  41. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +12 -1
  42. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/squad.py +39 -1
  43. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/tablebench.py +5 -5
  44. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogrande.py +55 -1
  45. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/eval_config.py +4 -1
  46. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/task_names.py +22 -0
  47. eval_framework-0.3.1/src/eval_framework/tasks/task_style.py +387 -0
  48. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/utils.py +93 -12
  49. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/generate_task_docs.py +9 -4
  50. eval_framework-0.3.1/src/template_formatting/py.typed +0 -0
  51. eval_framework-0.2.14/src/eval_framework/metrics/completion/accuracy_completion.py +0 -16
  52. eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -51
  53. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -100
  54. {eval_framework-0.2.14 → eval_framework-0.3.1}/LICENSE +0 -0
  55. {eval_framework-0.2.14 → eval_framework-0.3.1}/README.md +0 -0
  56. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/__init__.py +0 -0
  57. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/base_config.py +0 -0
  58. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/__init__.py +0 -0
  59. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/exceptions.py +0 -0
  60. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/drop_process_results.py +0 -0
  61. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  62. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  63. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  64. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  65. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  66. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/__init__.py +0 -0
  67. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
  68. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/base.py +0 -0
  69. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/huggingface.py +0 -0
  70. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/mistral.py +0 -0
  71. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/models.py +0 -0
  72. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/openai.py +0 -0
  73. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/vllm.py +0 -0
  74. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/logger.py +0 -0
  75. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/__init__.py +0 -0
  76. {eval_framework-0.2.14/src/eval_framework/metrics/efficiency → eval_framework-0.3.1/src/eval_framework/metrics/aggregators}/__init__.py +0 -0
  77. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
  78. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  79. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
  80. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
  81. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/comet.py +0 -0
  82. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  83. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  84. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  85. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  86. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  87. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  88. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  89. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
  90. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  91. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
  92. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  93. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  94. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  95. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  96. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
  97. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  98. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  99. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  100. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  101. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  102. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ter.py +0 -0
  103. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  104. {eval_framework-0.2.14/src/eval_framework/metrics/llm → eval_framework-0.3.1/src/eval_framework/metrics/efficiency}/__init__.py +0 -0
  105. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  106. {eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood → eval_framework-0.3.1/src/eval_framework/metrics/llm}/__init__.py +0 -0
  107. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/base.py +0 -0
  108. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  109. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  110. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  111. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  112. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  113. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  114. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  115. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  116. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  117. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  118. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  119. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  120. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  121. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  122. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  123. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  124. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  125. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  126. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  127. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  128. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  129. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  130. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  131. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  132. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  133. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/utils.py +0 -0
  134. {eval_framework-0.2.14/src/eval_framework/result_processors → eval_framework-0.3.1/src/eval_framework/metrics/loglikelihood}/__init__.py +0 -0
  135. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  136. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  137. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  138. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  139. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  140. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/py.typed +0 -0
  141. {eval_framework-0.2.14/src/eval_framework/tasks/benchmarks → eval_framework-0.3.1/src/eval_framework/result_processors}/__init__.py +0 -0
  142. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/base.py +0 -0
  143. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  144. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/result_processor.py +0 -0
  145. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  146. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/run_direct.py +0 -0
  147. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/shared/types.py +0 -0
  148. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/__init__.py +0 -0
  149. {eval_framework-0.2.14/src/template_formatting → eval_framework-0.3.1/src/eval_framework/tasks/benchmarks}/__init__.py +0 -0
  150. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  151. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  152. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  153. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  154. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  155. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  156. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  157. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  158. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  159. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  160. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  161. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  162. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  163. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  164. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  165. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  166. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  167. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  168. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  169. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  170. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  171. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  172. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  173. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  174. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  175. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  176. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  177. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  178. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  179. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  180. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  181. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  182. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/perturbation.py +0 -0
  183. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/registry.py +0 -0
  184. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/task_loader.py +0 -0
  185. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/constants.py +0 -0
  186. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/file_ops.py +0 -0
  187. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/helpers.py +0 -0
  188. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/logging.py +0 -0
  189. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/packaging.py +0 -0
  190. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
  191. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/README.md +0 -0
  192. /eval_framework-0.2.14/src/template_formatting/py.typed → /eval_framework-0.3.1/src/template_formatting/__init__.py +0 -0
  193. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/formatter.py +0 -0
  194. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/mistral_formatter.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: eval-framework
- Version: 0.2.14
+ Version: 0.3.1
  Summary: Evalulation Framework
  Author: Aleph Alpha Research
  License: Apache License
@@ -228,7 +228,7 @@ Requires-Dist: jsonschema>=4.23.0,<5
  Requires-Dist: mysql-connector-python>=9.0.0,<10
  Requires-Dist: psycopg2-binary>=2.9.9,<3
  Requires-Dist: sympy>=1.13.1,<2
- Requires-Dist: llm-sandbox[docker]>=0.1.8,<0.2
+ Requires-Dist: llm-sandbox[docker]==0.3.37
  Requires-Dist: jsonlines>=4,<5
  Requires-Dist: lxml>=6,<7
  Requires-Dist: python-iso639>=2025.2.18
@@ -236,6 +236,7 @@ Requires-Dist: wandb>=0.23.0,<1
  Requires-Dist: boto3>=1.40.54,<2
  Requires-Dist: numpy>=1.26.4
  Requires-Dist: antlr4-python3-runtime==4.11.0
+ Requires-Dist: scipy>=1.14.0,<2
  Requires-Dist: accelerate ; extra == 'accelerate'
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
@@ -1,6 +1,6 @@
  [project]
  name = "eval-framework"
- version = "0.2.14"
+ version = "0.3.1"
  description = "Evalulation Framework"
  readme = "README.md"
  license = { file = "LICENSE" }
@@ -35,7 +35,7 @@ dependencies = [
      "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
      "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
      "sympy>=1.13.1,<2",
-     "llm-sandbox[docker]>=0.1.8,<0.2",
+     "llm-sandbox[docker]==0.3.37",
      "jsonlines>=4,<5",
      "lxml>=6,<7",
      "python-iso639>=2025.2.18",
@@ -45,6 +45,8 @@ dependencies = [
      # is a dependency of sympy, but not explicitly listed in the requirements.txt
      # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
      "antlr4-python3-runtime==4.11.0",
+     "scipy>=1.14.0,<2", # required for the aggregation of pass@k metrics
+
  ]
 
  [project.optional-dependencies]
@@ -105,7 +107,6 @@ dev = [
      "types-requests>=2.32.0.20250328,<3",
      "plotly>=5.24.1,<6",
      "ruff>=0.12.8",
-     "scipy>=1.14.0,<2", # for tests comparing our Hungarian implementation to scipy
  ]
  flash-attn = [
      "flash-attn>=2.7.2.post1,<2.8",
@@ -148,6 +148,7 @@ class DeterminedContext(EvalContext):
              wandb_project=self.hparams.wandb_project or self.wandb_project,
              wandb_entity=self.hparams.wandb_entity or self.wandb_entity,
              wandb_run_id=self.hparams.wandb_run_id or self.wandb_run_id,
+             wandb_group=self.wandb_group,
              wandb_upload_results=self.hparams.wandb_upload_results or self.wandb_upload_results,
              batch_size=self.hparams.task_args.batch_size or self.batch_size,
              description=self.hparams.description or self.description,
@@ -61,6 +61,7 @@ class EvalContext(AbstractContextManager):
          wandb_project: str | None = None,
          wandb_entity: str | None = None,
          wandb_run_id: str | None = None,
+         wandb_group: str | None = None,
          wandb_upload_results: bool | None = None,
          hf_upload_dir: str | None = None,
          hf_upload_repo: str | None = None,
@@ -89,6 +90,7 @@ class EvalContext(AbstractContextManager):
          self.wandb_project = wandb_project
          self.wandb_entity = wandb_entity
          self.wandb_run_id = wandb_run_id
+         self.wandb_group = wandb_group
          self.wandb_upload_results = wandb_upload_results
          self.hf_upload_dir = hf_upload_dir
          self.hf_upload_repo = hf_upload_repo
@@ -58,6 +58,7 @@ class LocalContext(EvalContext):
              wandb_entity=self.wandb_entity,
              wandb_project=self.wandb_project,
              wandb_run_id=self.wandb_run_id,
+             wandb_group=self.wandb_group,
              wandb_upload_results=self.wandb_upload_results,
              llm_judge_class=self.llm_judge_class,
              judge_model_args=self.judge_model_args,
@@ -37,10 +37,17 @@ class EvaluationGenerator:
          self.save_intermediate_results = config.save_intermediate_results
 
          task_class = get_task(config.task_name)
-         if task_class.RESPONSE_TYPE == ResponseType.COMPLETION:
-             self.metrics = task_class.METRICS + [BytesCompletion, SequencePositionsCompletion]
-         elif task_class.RESPONSE_TYPE == ResponseType.LOGLIKELIHOODS:
-             self.metrics = task_class.METRICS + [BytesLoglikelihood, SequencePositionsLoglikelihood]
+         if hasattr(task_class, "TASK_STYLER"):
+             response_type = task_class.TASK_STYLER.response_type
+             task_metrics = list(task_class.TASK_STYLER.metrics)
+         else:
+             response_type = task_class.RESPONSE_TYPE
+             task_metrics = task_class.METRICS
+
+         if response_type == ResponseType.COMPLETION:
+             self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
+         elif response_type == ResponseType.LOGLIKELIHOODS:
+             self.metrics = task_metrics + [BytesLoglikelihood, SequencePositionsLoglikelihood]
          else:
              raise NotImplementedError
 
@@ -243,6 +250,61 @@ class EvaluationGenerator:
 
          return aggregated_results
 
+     def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
+         data = pd.DataFrame([r.model_dump() for r in results])
+         if len(data) == 0:
+             return {}
+         data = data.fillna({"key": ""})
+         aggregated_results: dict[str, float | None] = {}
+         data = data.loc[data.error.isnull()]
+
+         for (metric_name, current_metric_class), metric_group in data.groupby(["metric_name", "metric_class_name"]):
+             # We group by both metric_name and metric_class_name because we want to aggregate results for a
+             # single metric. Two metric classes can implement the same metric name, and we want to separate
+             # those cases. We cannot group by metric_class_name alone because each metric class can implement
+             # multiple metrics with different names.
+             current_metric = None
+             # Loop over the self.metrics list and find the metric class that matches current_metric_class.
+             for metric_class in self.metrics:
+                 if metric_class.__name__ == current_metric_class:
+                     current_metric = metric_class
+                     break
+             if current_metric is None:
+                 raise ValueError(f"Metric {metric_name} not found in metrics list")
+
+             for aggregator in current_metric.AGGREGATORS:
+                 aggregated_results[f"{aggregator.name} {current_metric_class}.{metric_name}"] = (
+                     aggregator(metric_group, ["prompt"])  # Compute the aggregator, grouped by the prompt...
+                     .groupby(["key", "subject"])  # ...then group by key, subject...
+                     .agg({"value": "mean"})["value"]  # ...and average scores over each key, subject group...
+                     .mean()  # ...and lastly average the scores across all groups, giving equal weight to every
+                     .item()  # key, subject group.
+                 )
+
+         # Loop to additionally compute per-subject/per-key breakdown metric scores, e.g. for only subject="algebra"
+         for (key, subject, metric_name, current_metric_class), ksm_group in data.groupby(
+             ["key", "subject", "metric_name", "metric_class_name"]
+         ):
+             current_metric = None
+             # Loop over the self.metrics list and find the metric class that matches current_metric_class.
+             for metric_class in self.metrics:
+                 if metric_class.__name__ == current_metric_class:
+                     current_metric = metric_class
+                     break
+
+             if current_metric is None:
+                 raise ValueError(f"Metric {metric_name} not found in metrics list. This should never happen.")
+
+             for aggregator in current_metric.AGGREGATORS:
+                 save_string = (
+                     f"{aggregator.name} {metric_name} - {subject}"
+                     if not key
+                     else f"{aggregator.name} {metric_name} - {key} - {subject}"
+                 )
+                 aggregated_results[save_string] = aggregator(ksm_group, ["prompt"])["value"].mean().mean().item()
+
+         return aggregated_results
+
      def run_eval(self) -> list[Result]:
          """Runs evaluation using saved completions."""
          logger.info("Running evaluation...")
@@ -252,6 +314,8 @@ class EvaluationGenerator:
 
          metrics_results = self._run_metric_calculators(responses)
          aggregated_results = self._aggregate_results(metrics_results)
+         results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
+         aggregated_results.update(results_with_aggregators)
 
          wandb.log(aggregated_results)
          self.result_processor.save_aggregated_results(aggregated_results)
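
The `_aggregate_results_with_aggregators` method added above macro-averages in two stages: per-prompt aggregator scores are first averaged within each (key, subject) group, then the group means are averaged with equal weight per group. A minimal standalone pandas sketch of that ordering, using hypothetical per-prompt scores (not part of the diff):

import pandas as pd

# Hypothetical per-prompt aggregator output: two algebra prompts, one geometry prompt.
per_prompt = pd.DataFrame(
    {
        "key": ["", "", ""],
        "subject": ["algebra", "algebra", "geometry"],
        "value": [1.0, 0.0, 1.0],
    }
)

# Same chain as in the diff: mean per (key, subject) group, then mean across groups.
macro = per_prompt.groupby(["key", "subject"]).agg({"value": "mean"})["value"].mean().item()
print(macro)  # 0.75: algebra contributes 0.5 and geometry 1.0, each group weighted equally
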
@@ -66,7 +66,7 @@ def main(
      with wandb.init(
          entity=config.wandb_entity,
          project=config.wandb_project,
-         group=llm.name[:127],
+         group=(config.wandb_group or llm.name)[:127],
          job_type=config.task_name[:63],
          id=wandb_run_id,  # (potentially resuming run after preemption)
          config=response_generator._get_metadata(),
@@ -0,0 +1,139 @@
+ from typing import Any, Protocol
+
+ import numpy as np
+ import pandas as pd
+ from scipy.special import comb
+
+
+ class Aggregator(Protocol):
+     """Base class for metric aggregators.
+
+     An aggregator collapses multiple evaluation rows for the same problem (i.e. prompt) into a
+     single score per problem. The input DataFrame has one row per (problem, attempt)
+     pair; the output has one row per problem with a new ``value``.
+
+     Args:
+         response_df: DataFrame where each row is one evaluation attempt. Must contain
+             a ``value`` column (the per-attempt score) and all ``identifier_columns``.
+         identifier_columns: Columns that uniquely identify a problem (e.g. ``["prompt"]``).
+             Rows sharing the same identifier are different attempts at the same problem.
+
+     Returns:
+         DataFrame with one row per unique problem and a ``value`` column holding
+         the aggregated score. All non-identifier, non-value columns are preserved
+         (typically via ``"first"``).
+
+     Example input (``identifier_columns=["prompt"]``, 3 attempts per problem):
+
+         | prompt         | value | subject |
+         |----------------|-------|---------|
+         | "What is 2+2?" | 1.0   | algebra |
+         | "What is 2+2?" | 1.0   | algebra |
+         | "What is 2+2?" | 0.0   | algebra |
+         | "Solve x^2=4"  | 0.0   | algebra |
+         | "Solve x^2=4"  | 1.0   | algebra |
+         | "Solve x^2=4"  | 0.0   | algebra |
+     """
+
+     name: str
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame: ...
+
+
+ def closed_form_passatk(n: int, c: int, k: int) -> float:
+     """Closed-form pass@k estimator (see the HumanEval paper).
+
+     pass@k = 1 - C(n-c, k) / C(n, k)
+
+     Given n total samples with c correct, this is the probability that at least one of k
+     randomly chosen samples is correct. The ratio C(n-c, k) / C(n, k) is the chance that all
+     k picks are wrong; subtracting it from 1 gives the success probability. When n - c < k
+     there aren't enough wrong samples to fill k slots, so the result is trivially 1.
+     """
+     if n < k:
+         return 1.0 if c > 0 else 0.0
+     if n - c < k:
+         return 1.0
+     return 1.0 - comb(n - c, k, exact=False) / comb(n, k, exact=False)
+
+
+ class PassAtK(Aggregator):
+     """Computes pass@k: the probability that at least one of k random attempts is correct.
+
+     Groups rows by ``identifier_columns``, counts correct (``c = sum(value)``) and
+     total (``n = count(value)``) attempts per problem, then applies the closed-form
+     estimator.
+
+     Expects ``value`` to be binary (0 or 1). For k=1 this is equivalent to the mean.
+
+     Example (k=2, continuing from the Aggregator docstring example):
+         "What is 2+2?": n=3, c=2, k=2 -> 1.0 (a correct pick is guaranteed)
+         "Solve x^2=4":  n=3, c=1, k=2 -> 0.667 (as computed by ``closed_form_passatk``)
+
+     Output:
+         | prompt         | value | subject |
+         |----------------|-------|---------|
+         | "What is 2+2?" | 1.000 | algebra |
+         | "Solve x^2=4"  | 0.667 | algebra |
+     """
+
+     def __init__(self, k: int = 1) -> None:
+         self.k = k
+         self.name = f"Pass@{k}"
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+         # agg_dict decides how each column (agg_dict key) gets aggregated (agg_dict value). For the `value`
+         # column we compute both the sum and the count; for all other columns we simply pick the first entry
+         # (they are identical within a group anyway).
+         other_cols = [c for c in response_df.columns if c not in identifier_columns and c != "value"]
+         agg_dict = {"value": ["sum", "count"], **{c: "first" for c in other_cols}}
+         agg = response_df.groupby(identifier_columns).agg(agg_dict)
+         # flatten multi-index columns from the value agg: ("value", "sum") / ("value", "count")
+         c = agg[("value", "sum")].values
+         n = agg[("value", "count")].values
+         scores = np.array([closed_form_passatk(n_i, c_i, self.k) for n_i, c_i in zip(n, c)])
+         out = agg.drop(columns=[("value", "sum"), ("value", "count")])
+         if isinstance(out.columns, pd.MultiIndex):
+             out.columns = out.columns.droplevel(1)
+         return out.assign(value=scores).reset_index()
+
+
+ class IdentifierMean(Aggregator):
+     """Computes the arithmetic mean of ``value`` across attempts per problem.
+
+     Example (continuing from the Aggregator docstring example):
+
+         "What is 2+2?": mean(1.0, 1.0, 0.0) = 0.667
+         "Solve x^2=4":  mean(0.0, 1.0, 0.0) = 0.333
+
+     Output:
+         | prompt         | value | subject |
+         |----------------|-------|---------|
+         | "What is 2+2?" | 0.667 | algebra |
+         | "Solve x^2=4"  | 0.333 | algebra |
+     """
+
+     def __init__(self) -> None:
+         self.name = "IdentifierMean"
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+         agg_dict = {
+             "value": "mean",
+         }
+         other_cols = [c for c in response_df.columns if c not in identifier_columns and c != "value"]
+         agg_dict.update({c: "first" for c in other_cols})
+         return response_df.groupby(identifier_columns).agg(agg_dict).reset_index()
+
+
+ class Identity:
+     """No-op aggregator: returns the input unchanged.
+
+     Use for metrics where each row is already a final score and no cross-attempt
+     aggregation is needed (e.g. when ``num_samples=1``).
+     """
+
+     def __init__(self) -> None:
+         self.name = "Identity"
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+         return response_df
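
A small usage sketch (assuming eval-framework 0.3.1 with scipy installed; the module path is the one introduced by this diff) reproducing the PassAtK docstring example:

import pandas as pd

from eval_framework.metrics.aggregators.aggregators import PassAtK, closed_form_passatk

# Three attempts per prompt, binary per-attempt scores, as in the docstring example.
df = pd.DataFrame(
    {
        "prompt": ["What is 2+2?"] * 3 + ["Solve x^2=4"] * 3,
        "value": [1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
        "subject": ["algebra"] * 6,
    }
)
out = PassAtK(k=2)(df, identifier_columns=["prompt"])
print(out[["prompt", "value"]])      # pass@2 of 1.0 and ~0.667, matching the docstring
print(closed_form_passatk(3, 1, 2))  # 0.666... = 1 - C(2,2) / C(3,2)
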
@@ -3,6 +3,7 @@ from typing import Any
 
  from pydantic import BaseModel, ConfigDict
 
+ from eval_framework.metrics.aggregators.aggregators import Aggregator
  from eval_framework.shared.types import Error
 
 
@@ -28,6 +29,10 @@ class classproperty:
  class BaseMetric[Response](ABC):
      NAME: str
      KEYS: list[str] | None = None
+     # The aggregator determines how to aggregate the results of a metric for a single
+     # sample over multiple runs (LLM calls). We default to averaging, which makes
+     # macro averaging the overall default.
+     AGGREGATORS: list[Aggregator] = []
 
      @classproperty
      def NAMES(cls) -> list[str]:
@@ -0,0 +1,116 @@
+ import re
+ import string
+ from typing import Any
+
+ import numpy as np
+
+ from eval_framework.metrics.base import BaseMetric, MetricResult
+ from eval_framework.shared.types import Completion
+
+
+ class AccuracyCompletion(BaseMetric[Completion]):
+     NAME = "Accuracy Completion"
+
+     def calculate(self, response: Completion) -> list[MetricResult]:
+         if response.error is not None:
+             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+
+         ground_truths = response.ground_truth_list
+         is_correct = any(response.completion == gt for gt in ground_truths)
+         return [
+             MetricResult(metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error)
+         ]
+
+
+ class AccuracyCompletionWithEvaluate(AccuracyCompletion):
+     def __init__(self, regexes_to_ignore: list[str], ignore_case: bool = False, ignore_punctuation: bool = False):
+         self.regexes_to_ignore = regexes_to_ignore
+         self.ignore_case = ignore_case
+         self.ignore_punctuation = ignore_punctuation
+
+     def calculate(self, response: Completion) -> list[MetricResult]:
+         if response.error is not None:
+             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+         ground_truths = response.ground_truth_list
+         model_answer = response.completion
+
+         is_correct = exact_match_hf_evaluate(
+             predictions=[model_answer] * len(ground_truths),
+             references=ground_truths,  # type: ignore[arg-type]
+             regexes_to_ignore=self.regexes_to_ignore,
+             ignore_case=self.ignore_case,
+             ignore_punctuation=self.ignore_punctuation,
+         )["exact_match"]
+         return [
+             MetricResult(metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error)
+         ]
+
+
+ class AccuracyCompletionOLMES(AccuracyCompletionWithEvaluate):
+     # If we used functools.partial here, the code would fail: there is an issubclass check
+     # that doesn't work with partial. These specific regexes are taken from
+     # https://github.com/allenai/olmes/blob/main/oe_eval/tasks/oe_eval_tasks/gsm8k.py#L70
+     def __init__(self) -> None:
+         super().__init__(regexes_to_ignore=[",", "\\$", "(?s).*#### ", "\\.$"])
+
+
+ # The following code is (largely) reproduced from https://github.com/allenai/olmes/blob/main/oe_eval/dependencies/hf_evaluate/exact_match.py#L25
+ # OLMES is released under the Apache 2.0 license, as is the HF evaluate library.
+ # Some cosmetic modifications have been made to fit our codebase and linting rules.
+ # -------------------------------------------------------------------------------------
+
+ ### Code ported from Huggingface's `evaluate` library at
+ ### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
+ ### which is under the Apache license.
+ ### Port taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/api/metrics.py, used
+ ### to fix the issue: https://github.com/EleutherAI/lm-evaluation-harness/pull/2045
+
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ #     http://www.apache.org/licenses/LICENSE-2.0
+
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ def exact_match_hf_evaluate(
+     predictions: list[str],
+     references: list[str],
+     regexes_to_ignore: list[str] | None = None,
+     ignore_case: bool = False,
+     ignore_punctuation: bool = False,
+     ignore_numbers: bool = False,
+ ) -> dict[str, Any]:  # type: ignore
+     if regexes_to_ignore is not None:
+         for s in regexes_to_ignore:
+             predictions = np.array([re.sub(s, "", x) for x in predictions])  # type: ignore
+             references = np.array([re.sub(s, "", x) for x in references])  # type: ignore
+     else:
+         predictions = np.asarray(predictions)  # type: ignore
+         references = np.asarray(references)  # type: ignore
+
+     if ignore_case:
+         predictions = np.char.lower(predictions)  # type: ignore
+         references = np.char.lower(references)  # type: ignore
+
+     if ignore_punctuation:
+         repl_table = string.punctuation.maketrans("", "", string.punctuation)
+         predictions = np.char.translate(predictions, table=repl_table)  # type: ignore
+         references = np.char.translate(references, table=repl_table)  # type: ignore
+
+     if ignore_numbers:
+         repl_table = string.digits.maketrans("", "", string.digits)
+         predictions = np.char.translate(predictions, table=repl_table)  # type: ignore
+         references = np.char.translate(references, table=repl_table)  # type: ignore
+
+     # NOTE: For multiple ground truths OLMES returns the mean over their scores. The max
+     # would be more meaningful, but we keep the mean here for parity.
+     score_list = predictions == references
+
+     return {"exact_match": np.mean(score_list)}
@@ -12,7 +12,19 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
 
          # this will always be a list; if the return value is "" it will be an empty list
          code = response.completion
-         output = run_python_code(code, image="python:3.12-slim")
+         try:
+             output = run_python_code(code, image="python:3.12-slim")
+         except Exception as e:
+             import traceback
+
+             return [
+                 MetricResult(
+                     metric_name=self.NAME,
+                     value=0.0,
+                     higher_is_better=True,
+                     error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
+                 )
+             ]
 
          # Split and filter out empty strings
          output_parts = [part for part in output.split() if part.strip()]
@@ -1,3 +1,4 @@
+ import importlib.resources
  import traceback
  from collections.abc import Callable
  from typing import Self
@@ -5,8 +6,17 @@ from typing import Self
  from pydantic import Field
 
  from eval_framework.metrics.base import BaseMetric, MetricResult
- from eval_framework.shared.types import BaseMetricContext, Completion, Error, extract_context_metric
- from eval_framework.tasks.utils import CallableSerializer, ExecutionResult, execute_python_code_with_tests
+ from eval_framework.shared.types import (
+     BaseMetricContext,
+     Completion,
+     Error,
+     extract_context_metric,
+ )
+ from eval_framework.tasks.utils import (
+     CallableSerializer,
+     ExecutionResult,
+     execute_python_code_with_tests,
+ )
 
 
  class CodeExecutionBaseContext(BaseMetricContext):
@@ -65,7 +75,14 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
 
      def calculate(self, response: Completion) -> list[MetricResult]:
          if response.error is not None:
-             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+             return [
+                 MetricResult(
+                     metric_name=self.NAME,
+                     value=None,
+                     higher_is_better=True,
+                     error=response.error,
+                 )
+             ]
          try:
              context = extract_context_metric(response, CodeExecutionPassAtOneContext)
              parsed_context = RealtimeCodeExectionContext.from_context(context)
@@ -76,8 +93,19 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
          try:
              c, output = self._count_correct_samples(response.completion, parsed_context)
          except Exception as e:
-             error = Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc())
-             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=error)]
+             error = Error(
+                 error_class=e.__class__.__name__,
+                 message=str(e),
+                 traceback=traceback.format_exc(),
+             )
+             return [
+                 MetricResult(
+                     metric_name=self.NAME,
+                     value=None,
+                     higher_is_better=True,
+                     error=error,
+                 )
+             ]
 
          pass_at_k_value = estimate_pass_at_k(n, c, self.k)
          return [
@@ -90,15 +118,40 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
              )
          ]
 
+     def _count_correct_samples(self, completion: str, context: RealtimeCodeExectionContext) -> tuple[int, str]:
+         try:
+             result = execute_python_code_with_tests(
+                 code=completion,
+                 test_code=context.test_code,
+                 package_mapping=context.package_downloads,
+                 merge_code_fn=context.snippet_merge_fn,
+                 image=context.run_env,
+                 timeout=context.benchmark_timeout,
+                 parse_output_fn=context.output_parse_fn,
+                 dockerfile=None,
+             )
+         except Exception as e:
+             return (0, str(e))
+         return (1 if result.success else 0), result.output
+
+
+ class CodeExecutionPassAtOneWithCodebench(CodeExecutionPassAtOne):
+     NAME = "code-execution-pass@1-codebench"
+
+     def __init__(self) -> None:
+         super().__init__()
+         self.dockerfile = str(importlib.resources.files("eval_framework.tasks") / "Dockerfile_codebench")
+
      def _count_correct_samples(self, completion: str, context: RealtimeCodeExectionContext) -> tuple[int, str]:
          result = execute_python_code_with_tests(
              code=completion,
              test_code=context.test_code,
-             package_mapping=context.package_downloads,
+             package_mapping={},  # the docker image contains everything
              merge_code_fn=context.snippet_merge_fn,
-             image=context.run_env,
+             image=None,  # dockerfile provided
              timeout=context.benchmark_timeout,
              parse_output_fn=context.output_parse_fn,
+             dockerfile=self.dockerfile,
          )
          return (1 if result.success else 0), result.output
 
@@ -2,7 +2,11 @@
 
  from eval_framework.external.drop_process_results import process_results
  from eval_framework.metrics.base import BaseMetric, MetricResult
- from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
+ from eval_framework.shared.types import (
+     BaseMetricContext,
+     Completion,
+     extract_context_metric,
+ )
 
 
  class DropMetricContext(BaseMetricContext):
@@ -20,10 +24,13 @@ class DropF1ExactMatch(BaseMetric[Completion]):
      def calculate(self, response: Completion) -> list[MetricResult]:
          if response.error is not None:
              return [
-                 MetricResult(metric_name=f"{self.NAME}/f1", value=None, higher_is_better=True, error=response.error),
                  MetricResult(
-                     metric_name=f"{self.NAME}/exact_match", value=None, higher_is_better=True, error=response.error
-                 ),
+                     metric_name=name,
+                     value=None,
+                     higher_is_better=True,
+                     error=response.error,
+                 )
+                 for name in [n.strip() for n in self.NAME.split("/")]
              ]
 
          context = extract_context_metric(response, DropMetricContext)
@@ -36,12 +43,14 @@ class DropF1ExactMatch(BaseMetric[Completion]):
              pred_spans = [raw]
 
          doc = {"answers": answer_tuples}
-         results = [pred_spans]
-         out = process_results(doc, results)
+         out = process_results(doc, pred_spans)
 
          return [
-             MetricResult(metric_name="DROP F1", value=out["f1"], higher_is_better=True, error=response.error),
              MetricResult(
-                 metric_name="Exact Match", value=out["exact_match"], higher_is_better=True, error=response.error
-             ),
+                 metric_name=name,
+                 value=out[key],
+                 higher_is_better=True,
+                 error=response.error,
+             )
+             for name, key in zip([n.strip() for n in self.NAME.split("/")], self.KEYS)
          ]
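
The refactored DropF1ExactMatch derives its two sub-metric names from NAME and pairs them with KEYS instead of hard-coding the strings. A hypothetical illustration of the pattern; the actual NAME and KEYS values are not shown in this diff:

# Hypothetical values: NAME encodes both sub-metric names separated by "/",
# KEYS holds the matching keys into the process_results output dict.
NAME = "DROP F1 / Exact Match"
KEYS = ["f1", "exact_match"]

names = [n.strip() for n in NAME.split("/")]
print(list(zip(names, KEYS)))  # [('DROP F1', 'f1'), ('Exact Match', 'exact_match')]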