PyPI - eval-framework - Versions diffs - 0.3.8__tar.gz → 0.5.1__tar.gz - Mend

eval-framework 0.3.8__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (197) hide show

{eval_framework-0.3.8 → eval_framework-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.3.8
+Version: 0.5.1
 Summary: Evaluation Framework
 Author: Aleph Alpha Research
 License:                                  Apache License
@@ -212,16 +212,15 @@ Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Dist: pyyaml>=6.0.3,<7
-Requires-Dist: xmltodict>=0.15.1,<0.16
+Requires-Dist: xmltodict>=1.0.4,<1.1
 Requires-Dist: pydantic>=2.13.4,<3
-Requires-Dist: datasets>=4.8.5,<5
+Requires-Dist: datasets>=5.0.0,<6
 Requires-Dist: sacrebleu>=2.6.0,<3
-Requires-Dist: pycountry>=24.6.1,<25
+Requires-Dist: pycountry>=26.2.16,<27
 Requires-Dist: nltk>=3.9.4,<4
 Requires-Dist: python-dotenv>=1.2.2,<2
 Requires-Dist: lingua-language-detector>=2.2.0,<3
 Requires-Dist: google-crc32c>=1.8.0,<2
-Requires-Dist: kubernetes>=31.0.0,<32
 Requires-Dist: langdetect>=1.0.9,<2
 Requires-Dist: spacy>=3.8.14,<4
 Requires-Dist: jsonschema>=4.26.0,<5
@@ -232,18 +231,17 @@ Requires-Dist: llm-sandbox[docker]==0.3.39
 Requires-Dist: jsonlines>=4,<5
 Requires-Dist: lxml>=6.1.1,<7
 Requires-Dist: python-iso639>=2026.4.20
-Requires-Dist: wandb>=0.27.0,<1
-Requires-Dist: boto3>=1.43.18,<2
-Requires-Dist: numpy>=1.26.4
+Requires-Dist: wandb>=0.27.2,<1
+Requires-Dist: boto3>=1.43.19,<2
+Requires-Dist: numpy>=2.2.6
 Requires-Dist: antlr4-python3-runtime==4.11.0
-Requires-Dist: scipy>=1.17.1,<2
+Requires-Dist: scipy>=1.18.0,<2
 Requires-Dist: accelerate ; extra == 'accelerate'
-Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
+Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
-Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
 Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
 Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
-Requires-Dist: mistral-common>=1.11.2,<2 ; extra == 'mistral'
+Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
 Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
 Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
 Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
@@ -253,7 +251,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
 Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
 Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
-Requires-Dist: accelerate>=0.34.2,<1 ; extra == 'transformers'
+Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
 Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
 Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
 Requires-Python: >=3.12, <3.13
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
 Provides-Extra: accelerate
 Provides-Extra: all
 Provides-Extra: api
-Provides-Extra: comet
 Provides-Extra: determined
 Provides-Extra: mistral
 Provides-Extra: openai
@@ -319,7 +316,6 @@ pip install eval_framework
 There are optional extras available to unlock specific features of the library:
 - `api` for inference using the aleph-alpha client.
-- `comet` for the COMET metric.
 - `determined` for running jobs via determined.
 - `mistral` for inference on Mistral models.
 - `transformers` for inference using the transformers library.

{eval_framework-0.3.8 → eval_framework-0.5.1}/README.md RENAMED Viewed

@@ -47,7 +47,6 @@ pip install eval_framework
 There are optional extras available to unlock specific features of the library:
 - `api` for inference using the aleph-alpha client.
-- `comet` for the COMET metric.
 - `determined` for running jobs via determined.
 - `mistral` for inference on Mistral models.
 - `transformers` for inference using the transformers library.

{eval_framework-0.3.8 → eval_framework-0.5.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.3.8"
+version = "0.5.1"
 description = "Evaluation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -19,16 +19,15 @@ classifiers = [
 ]
 dependencies = [
   "pyyaml>=6.0.3,<7",
-  "xmltodict>=0.15.1,<0.16",
+  "xmltodict>=1.0.4,<1.1",
   "pydantic>=2.13.4,<3",
-  "datasets>=4.8.5,<5",
+  "datasets>=5.0.0,<6",
   "sacrebleu>=2.6.0,<3",
-  "pycountry>=24.6.1,<25",
+  "pycountry>=26.2.16,<27",
   "nltk>=3.9.4,<4",
   "python-dotenv>=1.2.2,<2",
   "lingua-language-detector>=2.2.0,<3",
   "google-crc32c>=1.8.0,<2",
-  "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
   "langdetect>=1.0.9,<2", # required by the original ifeval implementation
   "spacy>=3.8.14,<4",
   "jsonschema>=4.26.0,<5",
@@ -39,14 +38,13 @@ dependencies = [
   "jsonlines>=4,<5",
   "lxml>=6.1.1,<7",
   "python-iso639>=2026.4.20",
-  "wandb>=0.27.0,<1",
-  "boto3>=1.43.18,<2",
-  "numpy>=1.26.4",
+  "wandb>=0.27.2,<1",
+  "boto3>=1.43.19,<2",
+  "numpy>=2.2.6",
   # is a dependency of sympy, but not explicitly listed in the requirements.txt
   # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
   "antlr4-python3-runtime==4.11.0",
-  "scipy>=1.17.1,<2",  # required for the aggregation of pass@k metrics
+  "scipy>=1.18.0,<2",  # required for the aggregation of pass@k metrics
 ]
 [project.optional-dependencies]
@@ -64,7 +62,7 @@ openai = [
 transformers = [
   "transformers>=4.45.2,<5",
   "torch>=2.5,<3",
-  "accelerate>=0.34.2,<1",
+  "accelerate>=1.14.0,<2",
 ]
 accelerate = ["accelerate"]
 vllm = [
@@ -72,21 +70,17 @@ vllm = [
   "torch>=2.5,<3"
 ]
 mistral = [
-  "mistral-common>=1.11.2,<2",
+  "mistral-common>=1.11.3,<2",
   "huggingface-hub>=0.36.2,<0.37",
   "eval_framework[vllm]",
 ]
-# Benchmark/metric specific extras
-comet = [
-  "unbabel-comet>=2.2.7,<3",
-]
 # from template-formatting
 optional = [
   "transformers>=4.45.2,<5",
   "jinja2>=3.1.6,<4"
 ]
 all = [
-  "eval_framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral]"
+  "eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
 ]
 [project.urls]
@@ -98,24 +92,24 @@ eval_framework = "eval_framework.run:run"
 [dependency-groups]
 dev = [
   "mypy>=2.1.0,<3",
-  "pytest>=9.0.3,<10",
+  "pytest>=9.1.0,<10",
   "pytest-mock>=3.15.1",
   "pytest-xdist>=3.8.0,<4",
   "pytest-sugar>1.1,<2",
   "types-pyyaml>=6.0.12.20260518,<7",
   "types-python-dateutil>=2.9.0.20260518,<3",
   "types-requests>=2.33.0.20260518,<3",
-  "plotly>=5.24.1,<6",
-  "ruff>=0.15.15",
+  "plotly>=6.8.0,<7",
+  "ruff>=0.15.18",
   "pip-licenses>=5.5.5",
 ]
 flash-attn = [
-  "flash-attn>=2.8.3,<2.9",
+  "flash-attn>=2.8.3.post1,<2.9",
   "torch"
 ]
 [build-system]
-requires = ["uv_build>=0.11.17,<0.11.18"]
+requires = ["uv_build>=0.11.23,<0.11.24"]
 build-backend = "uv_build"
 [tool.uv.build-backend]
@@ -126,22 +120,6 @@ override-dependencies = [
   "requests>=2.32,<3",  # fix for determined
 ]
-[tool.uv.sources]
-torch = [
-  { index = "pytorch-default", marker = "sys_platform != 'linux'" },
-  { index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
-]
-[[tool.uv.index]]
-name = "pytorch-cu124"
-url = "https://download.pytorch.org/whl/cu124"
-explicit = true
-[[tool.uv.index]]
-name = "pytorch-default"
-url = "https://pypi.org/simple"
-explicit = true
 [tool.uv.extra-build-dependencies]
 # Build flash-attn with the same torch version as in the container. Details at:
 # https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
@@ -167,6 +145,7 @@ known-third-party = ["wandb"]
 [tool.ruff.lint.extend-per-file-ignores]
 "__init__.py" = ["F401"]
+"tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
 [tool.mypy]
 plugins = "pydantic.mypy"

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/evaluation_generator.py RENAMED Viewed

@@ -18,7 +18,7 @@ from eval_framework.result_processors.base import Result, ResultProcessor
 from eval_framework.shared.types import Completion, Loglikelihood
 from eval_framework.tasks.base import ResponseType
 from eval_framework.tasks.eval_config import EvalConfig
-from eval_framework.tasks.registry import get_task
+from eval_framework.tasks.registry import registry
 from eval_framework.utils.constants import RED, RESET
 from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
@@ -36,13 +36,9 @@ class EvaluationGenerator:
         self.result_processor = result_processor
         self.save_intermediate_results = config.save_intermediate_results
-        task_class = get_task(config.task_name)
-        if hasattr(task_class, "TASK_STYLER"):
-            response_type = task_class.TASK_STYLER.response_type
-            task_metrics = list(task_class.TASK_STYLER.metrics)
-        else:
-            response_type = task_class.RESPONSE_TYPE
-            task_metrics = task_class.METRICS
+        eval_ = registry()[config.task_name]
+        response_type = eval_.response_type()
+        task_metrics = eval_.metrics()
         if response_type == ResponseType.COMPLETION:
             self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
@@ -51,7 +47,7 @@ class EvaluationGenerator:
         else:
             raise NotImplementedError
-        self.task_name = task_class.NAME
+        self.task_name = eval_.task_class().NAME
     def _run_metric_calculators(self, responses: list[Completion | Loglikelihood]) -> list[Result]:
         results: list[Result] = self.result_processor.load_metrics_results()

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/openai.py RENAMED Viewed

@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
         formatter: BaseFormatter | None = None,
         temperature: float | None = None,
         top_p: float | None = None,
-        api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
+        api_key: str | None = None,
         organization: str | None = None,
         base_url: str | None = None,
         bytes_per_token: float | None = None,
@@ -86,7 +86,7 @@ class OpenAIModel(BaseLLM):
         self._top_p = top_p
         self._client = OpenAI(
-            api_key=api_key,
+            api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
             organization=organization,
             base_url=base_url,
         )

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/base.py RENAMED Viewed

@@ -36,7 +36,7 @@ class BaseMetric[Response](ABC):
     # macro averaging the overall computation default.
     AGGREGATORS: list[Aggregator] = []
     # Set by the evaluation generator before calculate(); controls how infra failures are handled.
-    fail_on_error: bool = False
+    fail_on_error: bool = True
     @classproperty
     def NAMES(cls) -> list[str]:

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_assertion.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from llm_sandbox.exceptions import SandboxTimeoutError
 from eval_framework.metrics.base import BaseMetric, MetricResult
-from eval_framework.shared.types import Completion, Error
+from eval_framework.shared.types import Completion
 from eval_framework.tasks.utils import run_python_code
@@ -16,7 +16,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
         code = response.completion
         try:
             output = run_python_code(code, image="python:3.12-slim")
-        except SandboxTimeoutError as e:
+        except SandboxTimeoutError:
             # The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
             # problem.
             import traceback
@@ -26,7 +26,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
                     metric_name=self.NAME,
                     value=0.0,
                     higher_is_better=True,
-                    error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
+                    code_execution_trace=traceback.format_exc(),
                 )
             ]
         except Exception as e:
@@ -42,22 +42,12 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
             last_output = output_parts[-1]
         success = last_output == "True"
-        error = (
-            None
-            if success
-            else Error(
-                error_class="CodeCompletionAssertionError",
-                message=f"Expected 'True' but got '{last_output}'",
-                traceback=output,
-            )
-        )
         return [
             MetricResult(
                 metric_name=self.NAME,
                 value=1.0 if success else 0.0,
                 higher_is_better=True,
-                error=error,
+                error=None,
                 code_execution_trace=output,
             )
         ]

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/response_generator.py RENAMED Viewed

@@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable
 from datetime import UTC, datetime
 from functools import partial
-from eval_framework.tasks.registry import get_task
+from eval_framework.tasks.registry import registry
 try:
     from determined._info import get_cluster_info
@@ -28,7 +28,6 @@ from eval_framework.shared.types import (
 )
 from eval_framework.tasks.base import Language, ResponseType, Sample
 from eval_framework.tasks.eval_config import EvalConfig
-from eval_framework.tasks.perturbation import create_perturbation_class
 from eval_framework.tasks.utils import raise_errors
 from eval_framework.utils.constants import RED, RESET
 from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
@@ -54,7 +53,6 @@ def map_language_to_value(
 class ResponseGenerator:
     def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFileProcessor) -> None:
-        self.few_shot = config.num_fewshot
         self.task_name = config.task_name
         self.llm = llm
         self.config = config
@@ -62,20 +60,16 @@ class ResponseGenerator:
         self.num_samples = config.num_samples
         self.save_intermediate_results = config.save_intermediate_results
-        task_class = get_task(config.task_name)
         if config.perturbation_config is not None:
-            perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
-            self.task = perturbation_task_class.with_overwrite(
-                self.few_shot,
-                custom_subjects=self.config.task_subjects,
-                custom_hf_revision=self.config.hf_revision,
+            self.task = registry()[config.task_name].create_perturbation(
+                config.perturbation_config,
+                config.num_fewshot,
+                config.task_subjects,
+                config.hf_revision,
             )
         else:
-            self.task = task_class.with_overwrite(
-                self.few_shot,
-                custom_subjects=self.config.task_subjects,
-                custom_hf_revision=self.config.hf_revision,
+            self.task = registry()[config.task_name].create(
+                config.num_fewshot, config.task_subjects, config.hf_revision
             )
         self.response_type = self.task.get_response_type()

eval_framework-0.5.1/src/eval_framework/tasks/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+# Register all tasks on import
+from pathlib import Path
+from .dataset_revisions import DatasetRevision
+from .task_names import register_all_tasks
+DatasetRevision.add_revision_file(Path(__file__).parent / "task-dataset-revisions.json")
+register_all_tasks()
+del register_all_tasks
+del DatasetRevision

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/base.py RENAMED Viewed

@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
 from pydantic import BaseModel, ConfigDict
 from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
-from eval_framework.tasks.benchmarks.dataset_revisions import get_pinned_dataset_revision
+from eval_framework.tasks.dataset_revisions import DatasetRevision
 from eval_framework.tasks.utils import classproperty, raise_errors
 from template_formatting.formatter import Message, Role
@@ -118,7 +118,7 @@ class BaseTask[SubjectType](ABC):
         # Applied once at instance creation; not refreshed if the pin file changes mid-run.
         if custom_hf_revision:
             self.HF_REVISION = custom_hf_revision
-        elif self.HF_REVISION is None and (pinned := get_pinned_dataset_revision(self.__class__.__name__)):
+        elif self.HF_REVISION is None and (pinned := DatasetRevision.pinned_revision(self.__class__.__name__)):
             self.HF_REVISION = pinned
     @classmethod
@@ -359,7 +359,7 @@ class BaseTask[SubjectType](ABC):
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
-        fail_on_error: bool = False,
+        fail_on_error: bool = True,
     ) -> list[Completion]:
         """
         Generates completions for the sample.

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/aidanbench.py RENAMED Viewed

@@ -109,7 +109,7 @@ class AidanBenchOriginal(BaseTask[str]):
         stop_sequences: list[str] | None,
         max_tokens: int | None,
         initial_samples: list[Sample],
-        fail_on_error: bool = False,
+        fail_on_error: bool = True,
     ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
         initial_messages = [s.messages for s in initial_samples]
         samples = [(s, False) for s in initial_samples]  # (sample, is_done)
@@ -170,7 +170,7 @@ class AidanBenchOriginal(BaseTask[str]):
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
-        fail_on_error: bool = False,
+        fail_on_error: bool = True,
     ) -> list[Completion]:
         assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
             "Each sample must have exactly one USER message."

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores200.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import os
 import random
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 import pycountry
 from datasets import DatasetDict, DownloadConfig, load_dataset
@@ -100,11 +100,11 @@ class Flores200(BaseTask[str]):
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         source_key = item["subject"].split("-")[0]
-        source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
+        source_language = cast(Any, pycountry.languages.get(alpha_3=source_key.split("_")[0])).name
         source = item[f"sentence_{source_key}"]
         instruction = f"{source_language} sentence: {source}\n"
         target_key = item["subject"].split("-")[1]
-        target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
+        target_language = cast(Any, pycountry.languages.get(alpha_3=target_key.split("_")[0])).name
         return f"{instruction}{target_language} sentence:"

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores_plus.py RENAMED Viewed

@@ -4,7 +4,6 @@ from typing import Any
 from eval_framework.metrics.completion.bleu import BLEU
 from eval_framework.metrics.completion.chrf import CHRF
-from eval_framework.metrics.completion.comet import COMET
 from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
@@ -29,7 +28,7 @@ class FloresPlus(BaseTask[str]):
     SAMPLE_SPLIT = "dev"
     FEWSHOT_SPLIT = "devtest"
     RESPONSE_TYPE = ResponseType.COMPLETION
-    METRICS = [BLEU, CHRF, COMET]
+    METRICS = [BLEU, CHRF]
     SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
     PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
     LANGUAGE = {

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gsm8k.py RENAMED Viewed

@@ -4,6 +4,7 @@ from typing import Any
 from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion, AccuracyCompletionOLMES
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.task_style import BPBStyle
 logger = logging.getLogger(__name__)
@@ -95,7 +96,6 @@ class GSM8KEvalHarness(BaseTask[str]):
     NAME = "GSM8KEvalHarness"
     DATASET_PATH = "openai/gsm8k"
-    HF_REVISION = "main"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     RESPONSE_TYPE = ResponseType.COMPLETION
@@ -216,3 +216,32 @@ class GSM8K_OLMES(GSM8K):
     def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
         return self._clean_short_answer(completion_text)
+class GSM8KBPB(GSM8K_OLMES):
+    NAME = "GSM8KBPB"
+    TASK_STYLER = BPBStyle(cue_text="Answer:", leading_space_continuations=False)
+    # BPBStyle already adds "Answer:" as that separate assistant message. But the methods we inherit
+    # still put "Answer:" at the end of the question text and leave it out of the fewshot answer.
+    # So we override them here: remove "Answer:" from the question, and add it back in front of the
+    # fewshot answer. Without this, the question ends in "Answer:Answer:" and fewshot answers have
+    # no "Answer:" label at all.
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return f"Question: {item['question']}\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        return f"Answer:{self.normalize_answer_str(item)}"
+    def _get_raw_question(self, item: dict[str, Any]) -> str:
+        return item["question"]
+    def _get_choices(self, item: dict[str, Any]) -> list[str]:
+        return [self.normalize_answer_str(item)]
+    def _get_correct_index(self, item: dict[str, Any]) -> int:
+        return 0
+    def _get_ground_truth(self, item: dict[str, Any]) -> str:
+        return self._get_choices(item)[0]

{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py RENAMED Viewed

@@ -14,8 +14,8 @@ from eval_framework.metrics.completion.minerva_math_utils import (
     extract_answers,
     normalized_gold_from_solution,
 )
-from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
 from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
+from eval_framework.tasks.task_style import BPBStyle
 # Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB)
 MATH_SUBJECTS = [
@@ -612,44 +612,6 @@ class MATH500Minerva(MATHMinerva):
         super().__init__(num_fewshot)
-class MATHMinervaBPB(MATHReasoning):
-    """
-    MATH (Hendrycks) with Minerva-style prompt, evaluated via loglikelihood of the
-    gold answer string (bits-per-byte).
-    Same prompt as MATHMinerva; scores P(normalized_gold_answer | prompt).
-    """
-    NAME = "MATHMinervaBPB"
-    DATASET_PATH = "EleutherAI/hendrycks_math"
-    SAMPLE_SPLIT = "test"
-    FEWSHOT_SPLIT = "train"
-    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
-    METRICS = [BitsPerByteLoglikelihood]
-    SUBJECTS = MATH_SUBJECTS
-    LANGUAGE = Language.ENG
-    def _get_instruction_text(self, item: dict[str, Any]) -> str:
-        return "Problem:\n" + item["problem"] + "\n\n" + "Solution:"
-    def _get_cue_text(self, item: dict[str, Any]) -> str:
-        return ""
-    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
-        normalized = self._normalized_gold_from_solution(item["solution"])
-        if normalized is None:
-            return None
-        return " " + normalized
-    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
-        normalized = self._normalized_gold_from_solution(item["solution"])
-        if normalized is None:
-            return None
-        return [" " + normalized]
-    def _normalized_gold_from_solution(self, solution: str) -> str | None:
-        return normalized_gold_from_solution(solution)
 class MATHLvl5(MATH):
     NAME = "Math Lvl 5"
@@ -742,7 +704,7 @@ Answer:"""
 _OLMES_FEWSHOTS = [
-    ## https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
+    # https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
     {
         "problem": "Find the domain of the expression  $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
         "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so "
@@ -790,3 +752,35 @@ class MATHMinerva_OLMES(MATHMinerva):
     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
         return _OLMES_FEWSHOTS[: self.num_fewshot]
+class MATHMinervaBPB(MATHMinerva_OLMES):
+    NAME = "MATHMinervaBPB"
+    TASK_STYLER = BPBStyle(cue_text="Solution:")
+    # BPBStyle already adds "Solution:" as that separate assistant message. But the methods we inherit
+    # still put "Solution:" at the end of the question text and leave it out of the fewshot answer.
+    # So we override them here: remove "Solution:" from the question, and add it back in front of the
+    # fewshot answer. Without this, the question ends in "Solution:Solution:" and fewshot answers have
+    # no "Solution:" label at all.
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return "Problem:\n" + item["problem"] + "\n\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        return f"Solution: {item['solution']}"
+    def _get_choices(self, item: dict[str, Any]) -> list[str]:
+        answer = normalized_gold_from_solution(item["solution"])
+        template = f"\nFinal Answer: The final answer is {answer}. I hope it is correct."
+        return [item["solution"] + template]
+    def _get_correct_index(self, item: dict[str, Any]) -> int:
+        return 0
+    def _get_raw_question(self, item: dict[str, Any]) -> str:
+        return item["problem"]
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        return self._get_choices(item)[0]

eval-framework 0.3.8__tar.gz → 0.5.1__tar.gz

eval-framework 0.3.8__tar.gz → 0.5.1__tar.gz