PyPI - eval-framework - Versions diffs - 0.3.8__tar.gz → 0.5.0__tar.gz - Mend

eval-framework 0.3.8__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (196) hide show

{eval_framework-0.3.8 → eval_framework-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.3.8
+Version: 0.5.0
 Summary: Evaluation Framework
 Author: Aleph Alpha Research
 License:                                  Apache License
@@ -212,16 +212,15 @@ Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
 Requires-Dist: pyyaml>=6.0.3,<7
-Requires-Dist: xmltodict>=0.15.1,<0.16
+Requires-Dist: xmltodict>=1.0.4,<1.1
 Requires-Dist: pydantic>=2.13.4,<3
-Requires-Dist: datasets>=4.8.5,<5
+Requires-Dist: datasets>=5.0.0,<6
 Requires-Dist: sacrebleu>=2.6.0,<3
-Requires-Dist: pycountry>=24.6.1,<25
+Requires-Dist: pycountry>=26.2.16,<27
 Requires-Dist: nltk>=3.9.4,<4
 Requires-Dist: python-dotenv>=1.2.2,<2
 Requires-Dist: lingua-language-detector>=2.2.0,<3
 Requires-Dist: google-crc32c>=1.8.0,<2
-Requires-Dist: kubernetes>=31.0.0,<32
 Requires-Dist: langdetect>=1.0.9,<2
 Requires-Dist: spacy>=3.8.14,<4
 Requires-Dist: jsonschema>=4.26.0,<5
@@ -232,18 +231,17 @@ Requires-Dist: llm-sandbox[docker]==0.3.39
 Requires-Dist: jsonlines>=4,<5
 Requires-Dist: lxml>=6.1.1,<7
 Requires-Dist: python-iso639>=2026.4.20
-Requires-Dist: wandb>=0.27.0,<1
-Requires-Dist: boto3>=1.43.18,<2
-Requires-Dist: numpy>=1.26.4
+Requires-Dist: wandb>=0.27.2,<1
+Requires-Dist: boto3>=1.43.19,<2
+Requires-Dist: numpy>=2.2.6
 Requires-Dist: antlr4-python3-runtime==4.11.0
 Requires-Dist: scipy>=1.17.1,<2
 Requires-Dist: accelerate ; extra == 'accelerate'
-Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
+Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
-Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
 Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
 Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
-Requires-Dist: mistral-common>=1.11.2,<2 ; extra == 'mistral'
+Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
 Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
 Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
 Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
@@ -253,7 +251,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
 Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
 Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
-Requires-Dist: accelerate>=0.34.2,<1 ; extra == 'transformers'
+Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
 Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
 Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
 Requires-Python: >=3.12, <3.13
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
 Provides-Extra: accelerate
 Provides-Extra: all
 Provides-Extra: api
-Provides-Extra: comet
 Provides-Extra: determined
 Provides-Extra: mistral
 Provides-Extra: openai
@@ -319,7 +316,6 @@ pip install eval_framework
 There are optional extras available to unlock specific features of the library:
 - `api` for inference using the aleph-alpha client.
-- `comet` for the COMET metric.
 - `determined` for running jobs via determined.
 - `mistral` for inference on Mistral models.
 - `transformers` for inference using the transformers library.

{eval_framework-0.3.8 → eval_framework-0.5.0}/README.md RENAMED Viewed

@@ -47,7 +47,6 @@ pip install eval_framework
 There are optional extras available to unlock specific features of the library:
 - `api` for inference using the aleph-alpha client.
-- `comet` for the COMET metric.
 - `determined` for running jobs via determined.
 - `mistral` for inference on Mistral models.
 - `transformers` for inference using the transformers library.

{eval_framework-0.3.8 → eval_framework-0.5.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.3.8"
+version = "0.5.0"
 description = "Evaluation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -19,16 +19,15 @@ classifiers = [
 ]
 dependencies = [
   "pyyaml>=6.0.3,<7",
-  "xmltodict>=0.15.1,<0.16",
+  "xmltodict>=1.0.4,<1.1",
   "pydantic>=2.13.4,<3",
-  "datasets>=4.8.5,<5",
+  "datasets>=5.0.0,<6",
   "sacrebleu>=2.6.0,<3",
-  "pycountry>=24.6.1,<25",
+  "pycountry>=26.2.16,<27",
   "nltk>=3.9.4,<4",
   "python-dotenv>=1.2.2,<2",
   "lingua-language-detector>=2.2.0,<3",
   "google-crc32c>=1.8.0,<2",
-  "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
   "langdetect>=1.0.9,<2", # required by the original ifeval implementation
   "spacy>=3.8.14,<4",
   "jsonschema>=4.26.0,<5",
@@ -39,14 +38,13 @@ dependencies = [
   "jsonlines>=4,<5",
   "lxml>=6.1.1,<7",
   "python-iso639>=2026.4.20",
-  "wandb>=0.27.0,<1",
-  "boto3>=1.43.18,<2",
-  "numpy>=1.26.4",
+  "wandb>=0.27.2,<1",
+  "boto3>=1.43.19,<2",
+  "numpy>=2.2.6",
   # is a dependency of sympy, but not explicitly listed in the requirements.txt
   # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
   "antlr4-python3-runtime==4.11.0",
   "scipy>=1.17.1,<2",  # required for the aggregation of pass@k metrics
 ]
 [project.optional-dependencies]
@@ -64,7 +62,7 @@ openai = [
 transformers = [
   "transformers>=4.45.2,<5",
   "torch>=2.5,<3",
-  "accelerate>=0.34.2,<1",
+  "accelerate>=1.14.0,<2",
 ]
 accelerate = ["accelerate"]
 vllm = [
@@ -72,21 +70,17 @@ vllm = [
   "torch>=2.5,<3"
 ]
 mistral = [
-  "mistral-common>=1.11.2,<2",
+  "mistral-common>=1.11.3,<2",
   "huggingface-hub>=0.36.2,<0.37",
   "eval_framework[vllm]",
 ]
-# Benchmark/metric specific extras
-comet = [
-  "unbabel-comet>=2.2.7,<3",
-]
 # from template-formatting
 optional = [
   "transformers>=4.45.2,<5",
   "jinja2>=3.1.6,<4"
 ]
 all = [
-  "eval_framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral]"
+  "eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
 ]
 [project.urls]
@@ -98,15 +92,15 @@ eval_framework = "eval_framework.run:run"
 [dependency-groups]
 dev = [
   "mypy>=2.1.0,<3",
-  "pytest>=9.0.3,<10",
+  "pytest>=9.1.0,<10",
   "pytest-mock>=3.15.1",
   "pytest-xdist>=3.8.0,<4",
   "pytest-sugar>1.1,<2",
   "types-pyyaml>=6.0.12.20260518,<7",
   "types-python-dateutil>=2.9.0.20260518,<3",
   "types-requests>=2.33.0.20260518,<3",
-  "plotly>=5.24.1,<6",
-  "ruff>=0.15.15",
+  "plotly>=6.8.0,<7",
+  "ruff>=0.15.18",
   "pip-licenses>=5.5.5",
 ]
 flash-attn = [
@@ -115,7 +109,7 @@ flash-attn = [
 ]
 [build-system]
-requires = ["uv_build>=0.11.17,<0.11.18"]
+requires = ["uv_build>=0.11.22,<0.11.23"]
 build-backend = "uv_build"
 [tool.uv.build-backend]
@@ -126,22 +120,6 @@ override-dependencies = [
   "requests>=2.32,<3",  # fix for determined
 ]
-[tool.uv.sources]
-torch = [
-  { index = "pytorch-default", marker = "sys_platform != 'linux'" },
-  { index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
-]
-[[tool.uv.index]]
-name = "pytorch-cu124"
-url = "https://download.pytorch.org/whl/cu124"
-explicit = true
-[[tool.uv.index]]
-name = "pytorch-default"
-url = "https://pypi.org/simple"
-explicit = true
 [tool.uv.extra-build-dependencies]
 # Build flash-attn with the same torch version as in the container. Details at:
 # https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
@@ -167,6 +145,7 @@ known-third-party = ["wandb"]
 [tool.ruff.lint.extend-per-file-ignores]
 "__init__.py" = ["F401"]
+"tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
 [tool.mypy]
 plugins = "pydantic.mypy"

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/openai.py RENAMED Viewed

@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
         formatter: BaseFormatter | None = None,
         temperature: float | None = None,
         top_p: float | None = None,
-        api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
+        api_key: str | None = None,
         organization: str | None = None,
         base_url: str | None = None,
         bytes_per_token: float | None = None,
@@ -86,7 +86,7 @@ class OpenAIModel(BaseLLM):
         self._top_p = top_p
         self._client = OpenAI(
-            api_key=api_key,
+            api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
             organization=organization,
             base_url=base_url,
         )

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/base.py RENAMED Viewed

@@ -36,7 +36,7 @@ class BaseMetric[Response](ABC):
     # macro averaging the overall computation default.
     AGGREGATORS: list[Aggregator] = []
     # Set by the evaluation generator before calculate(); controls how infra failures are handled.
-    fail_on_error: bool = False
+    fail_on_error: bool = True
     @classproperty
     def NAMES(cls) -> list[str]:

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_assertion.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from llm_sandbox.exceptions import SandboxTimeoutError
 from eval_framework.metrics.base import BaseMetric, MetricResult
-from eval_framework.shared.types import Completion, Error
+from eval_framework.shared.types import Completion
 from eval_framework.tasks.utils import run_python_code
@@ -16,7 +16,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
         code = response.completion
         try:
             output = run_python_code(code, image="python:3.12-slim")
-        except SandboxTimeoutError as e:
+        except SandboxTimeoutError:
             # The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
             # problem.
             import traceback
@@ -26,7 +26,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
                     metric_name=self.NAME,
                     value=0.0,
                     higher_is_better=True,
-                    error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
+                    code_execution_trace=traceback.format_exc(),
                 )
             ]
         except Exception as e:
@@ -42,22 +42,12 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
             last_output = output_parts[-1]
         success = last_output == "True"
-        error = (
-            None
-            if success
-            else Error(
-                error_class="CodeCompletionAssertionError",
-                message=f"Expected 'True' but got '{last_output}'",
-                traceback=output,
-            )
-        )
         return [
             MetricResult(
                 metric_name=self.NAME,
                 value=1.0 if success else 0.0,
                 higher_is_better=True,
-                error=error,
+                error=None,
                 code_execution_trace=output,
             )
         ]

eval_framework-0.5.0/src/eval_framework/tasks/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+# Register all tasks on import
+from pathlib import Path
+from .dataset_revisions import DatasetRevision
+from .task_names import register_all_tasks
+DatasetRevision.add_revision_file(Path(__file__).parent / "task-dataset-revisions.json")
+register_all_tasks()
+del register_all_tasks
+del DatasetRevision

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/base.py RENAMED Viewed

@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
 from pydantic import BaseModel, ConfigDict
 from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
-from eval_framework.tasks.benchmarks.dataset_revisions import get_pinned_dataset_revision
+from eval_framework.tasks.dataset_revisions import DatasetRevision
 from eval_framework.tasks.utils import classproperty, raise_errors
 from template_formatting.formatter import Message, Role
@@ -118,7 +118,7 @@ class BaseTask[SubjectType](ABC):
         # Applied once at instance creation; not refreshed if the pin file changes mid-run.
         if custom_hf_revision:
             self.HF_REVISION = custom_hf_revision
-        elif self.HF_REVISION is None and (pinned := get_pinned_dataset_revision(self.__class__.__name__)):
+        elif self.HF_REVISION is None and (pinned := DatasetRevision.pinned_revision(self.__class__.__name__)):
             self.HF_REVISION = pinned
     @classmethod
@@ -359,7 +359,7 @@ class BaseTask[SubjectType](ABC):
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
-        fail_on_error: bool = False,
+        fail_on_error: bool = True,
     ) -> list[Completion]:
         """
         Generates completions for the sample.

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/aidanbench.py RENAMED Viewed

@@ -109,7 +109,7 @@ class AidanBenchOriginal(BaseTask[str]):
         stop_sequences: list[str] | None,
         max_tokens: int | None,
         initial_samples: list[Sample],
-        fail_on_error: bool = False,
+        fail_on_error: bool = True,
     ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
         initial_messages = [s.messages for s in initial_samples]
         samples = [(s, False) for s in initial_samples]  # (sample, is_done)
@@ -170,7 +170,7 @@ class AidanBenchOriginal(BaseTask[str]):
         samples: list[Sample],
         stop_sequences: list[str] | None = None,
         max_tokens: int | None = None,
-        fail_on_error: bool = False,
+        fail_on_error: bool = True,
     ) -> list[Completion]:
         assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
             "Each sample must have exactly one USER message."

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores200.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import os
 import random
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 import pycountry
 from datasets import DatasetDict, DownloadConfig, load_dataset
@@ -100,11 +100,11 @@ class Flores200(BaseTask[str]):
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         source_key = item["subject"].split("-")[0]
-        source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
+        source_language = cast(Any, pycountry.languages.get(alpha_3=source_key.split("_")[0])).name
         source = item[f"sentence_{source_key}"]
         instruction = f"{source_language} sentence: {source}\n"
         target_key = item["subject"].split("-")[1]
-        target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
+        target_language = cast(Any, pycountry.languages.get(alpha_3=target_key.split("_")[0])).name
         return f"{instruction}{target_language} sentence:"

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores_plus.py RENAMED Viewed

@@ -4,7 +4,6 @@ from typing import Any
 from eval_framework.metrics.completion.bleu import BLEU
 from eval_framework.metrics.completion.chrf import CHRF
-from eval_framework.metrics.completion.comet import COMET
 from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
@@ -29,7 +28,7 @@ class FloresPlus(BaseTask[str]):
     SAMPLE_SPLIT = "dev"
     FEWSHOT_SPLIT = "devtest"
     RESPONSE_TYPE = ResponseType.COMPLETION
-    METRICS = [BLEU, CHRF, COMET]
+    METRICS = [BLEU, CHRF]
     SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
     PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
     LANGUAGE = {

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gsm8k.py RENAMED Viewed

@@ -95,7 +95,6 @@ class GSM8KEvalHarness(BaseTask[str]):
     NAME = "GSM8KEvalHarness"
     DATASET_PATH = "openai/gsm8k"
-    HF_REVISION = "main"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     RESPONSE_TYPE = ResponseType.COMPLETION

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/squad.py RENAMED Viewed

@@ -236,6 +236,27 @@ class SQUAD(SQUAD2):
         return item["answers"]["text"]
+class SQuAD2_MA(SQUAD2):
+    """SQuAD v2 with the exact system prompt used in MA training"""
+    NAME = "SQuAD2_MA"
+    UNANSWERABLE_STR = "unanswerable"
+    METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
+        return (
+            "You are a helpful assistant and will answer the user's questions carefully, "
+            "logically, accurately and well-reasoned.\n"
+            "Use the given context to answer the question faithfully. Answer only if the "
+            f"answer is present in the given context, otherwise respond with '{self.UNANSWERABLE_STR}' "
+            "if the answer is not present in the context."
+        )
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return f"Context:\n{item['context']}\n\nQuestion:\n{item['question']}\n"
 class SQuAD_OLMES(SQUAD):
     """SQuAD variant matching OLMES implementation."""

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/triviaqa.py RENAMED Viewed

@@ -2,7 +2,7 @@ import random
 from typing import Any
 from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
-from eval_framework.metrics.completion.f1 import F1
+from eval_framework.metrics.completion.f1 import F1, F1SquadNormalized
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
@@ -40,3 +40,29 @@ class TRIVIAQA(BaseTask[str]):
     def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
         return completion_text.strip().rstrip(".")
+class TriviaQA_MA(TRIVIAQA):
+    """TriviaQA with the exact system prompt used in MA training"""
+    NAME = "TriviaQA_MA"
+    SUBJECTS = ["rc.wikipedia"]
+    UNANSWERABLE_STR = "unanswerable"
+    METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
+    def _get_context_text(self, item: dict[str, Any]) -> str:
+        return "\n\n".join(item["entity_pages"]["wiki_context"])
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
+        return (
+            "You are a helpful assistant and will answer the user's questions carefully, "
+            "logically, accurately and well-reasoned.\n"
+            "Use the given context to answer the question faithfully. Answer only if the "
+            f"answer is present in the given context, otherwise respond with '{self.UNANSWERABLE_STR}' "
+            "if the answer is not present in the context."
+        )
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return f"Context:\n{self._get_context_text(item)}\n\nQuestion:\n{item['question'].strip()}\n"

{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/wmt.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import random
 from abc import ABC
-from typing import Any
+from typing import Any, cast
 import pycountry
 import sacrebleu
@@ -38,7 +38,7 @@ class WMT(BaseTask[str], ABC):
     def _code_to_language(self, code: str) -> str:
         # key is alpha_2 or alpha_3 depending on the code length
         key = f"alpha_{len(code)}"
-        language_tuple = pycountry.languages.get(**{key: code})
+        language_tuple = cast(Any, pycountry.languages.get(**{key: code}))
         return language_tuple.name
     def _get_instruction_text(self, item: dict[str, Any]) -> str:

{eval_framework-0.3.8/src/eval_framework/tasks/benchmarks → eval_framework-0.5.0/src/eval_framework/tasks}/dataset_revisions.py RENAMED Viewed

@@ -25,12 +25,35 @@ def _pinned_revisions(revisions_file: Path) -> dict[str, str]:
     return json.loads(revisions_file.read_text(encoding="utf-8"))
-def get_pinned_dataset_revision(
-    task_class_name: str,
-    *,
-    revisions_file: Path | None = None,
-) -> str | None:
-    return _pinned_revisions(revisions_file or REVISIONS_FILE).get(task_class_name)
+class DatasetRevision:
+    _INSTANCE: "DatasetRevision | None" = None
+    def __init__(self) -> None:
+        self._cache: dict[str, str] = {}
+    @classmethod
+    def _get_instance(cls) -> "DatasetRevision":
+        if cls._INSTANCE is None:
+            cls._INSTANCE = cls()
+        return cls._INSTANCE
+    @classmethod
+    def add_revision_file(cls, file_path: Path | str) -> None:
+        instance = cls._get_instance()
+        instance._append_revision_file(Path(file_path))
+    @classmethod
+    def pinned_revision(cls, task_class_name: str) -> str | None:
+        return cls._get_instance()._cache.get(task_class_name)
+    @classmethod
+    def reset(cls) -> None:
+        # for unit tests only.
+        cls._INSTANCE = None
+    def _append_revision_file(self, file_path: Path) -> None:
+        revisions = _pinned_revisions(file_path)
+        self._cache |= revisions
 def _repo_sha(api: HfApi, repo_id: str, cache: dict[str, str | None]) -> str | None:
@@ -73,7 +96,7 @@ def main() -> None:
     revisions = collect_dataset_revisions(registered_task_names(), HfApi())
     REVISIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
     REVISIONS_FILE.write_text(
-        json.dumps(dict(sorted(revisions.items())), indent=2, ensure_ascii=False) + "\n",
+        json.dumps(dict(sorted(revisions.items())), indent=4, ensure_ascii=False) + "\n",
         encoding="utf-8",
     )
     logger.info("Wrote %d revisions to %s", len(revisions), REVISIONS_FILE)

eval-framework 0.3.8__tar.gz → 0.5.0__tar.gz

eval-framework 0.3.8__tar.gz → 0.5.0__tar.gz